fix: analyse locale CI failing with worker pool

pull/3206/head
Audric Ackermann 7 months ago
parent 667bbfb398
commit 919abc03b6
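
The failure this commit addresses is a property of multiprocessing worker
pools: when Python starts workers with the "spawn" method, every child
re-imports the script, so any pool work set up at module level runs again in
each worker. Wrapping the script body in an "if __name__ == '__main__':"
guard, as the diff below does, keeps pool creation out of the children. A
minimal standalone sketch of the pattern (illustrative only, not code from
this repo):

    import multiprocessing

    def square(n):
        # The work function stays at module level so workers can import it.
        return n * n

    if __name__ == '__main__':
        # Without this guard, a spawned worker would re-run the module body
        # and try to create its own pool instead of just running square().
        with multiprocessing.Pool() as pool:
            print(pool.map(square, range(10)))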

@@ -21,369 +21,373 @@ from localization.regex import localization_regex_as_list
from util.fileUtils import makeDirIfNotExists, removeFileIfExists
from util.logger import console
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--debug", action="store_true", help="Enable debug mode, print debug messages"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="./tools/localization/analysis",
        help="Output directory for the results",
    )
    parser.add_argument(
        "--write-found-to-file",
        action="store_true",
        help="Write the found strings to a file",
    )
    parser.add_argument(
        "--write-not-found-to-file",
        action="store_true",
        help="Write the not found strings to a file",
    )
    parser.add_argument(
        "--print-not-found",
        action="store_true",
        help="Print the not found strings",
    )
    parser.add_argument(
        "--identify-found-in-files",
        action="store_true",
        help="Identify line-numbers using regex.",
    )
    parser.add_argument(
        "--identify-line-numbers",
        action="store_true",
        help="Identify line-numbers using regex.",
    )
    parser.add_argument(
        "--disable-concurrency",
        action="store_true",
        help="Disable multiprocessing concurrency.",
    )
    parser.add_argument(
        "--find-potential-matches",
        action="store_true",
        help="Find potential matched strings using very lazy regex.",
    )
    parser.add_argument(
        "--delete-unused-keys",
        action="store_true",
        help="Delete unused keys.",
    )
    args = parser.parse_args()
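
    # Editor's illustration (not part of the commit): every flag above is an
    # argparse store_true option, so each defaults to False and flips to True
    # only when passed. A hypothetical invocation:
    #
    #     args = parser.parse_args(["--debug", "--write-not-found-to-file"])
    #     # args.debug -> True, args.write_not_found_to_file -> True,
    #     # args.disable_concurrency -> False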
    # Configuration
    DEBUG = args.debug
    CONCURRENCY_ENABLED = not args.disable_concurrency
    if CONCURRENCY_ENABLED and (args.identify_found_in_files or args.identify_line_numbers):
        CONCURRENCY_ENABLED = False
        console.info("Concurrency is disabled when --identify-found-in-files or --identify-line-numbers is used")
    if CONCURRENCY_ENABLED:
        console.info("Concurrency enabled. Use --disable-concurrency to disable concurrency.")

    console.enableDebug() if DEBUG else None

    OUTPUT_DIR = args.output_dir
    FOUND_STRINGS_PATH = os.path.join(OUTPUT_DIR, "found_strings.csv")
    NOT_FOUND_STRINGS_PATH = os.path.join(OUTPUT_DIR, "not_found_strings.txt")
    POTENTIAL_MATCHES_PATH = os.path.join(OUTPUT_DIR, "potential_matches.csv")
    NOT_IN_MASTER_LIST_PATH = os.path.join(OUTPUT_DIR, "not_in_master_list.csv")
    EN_PATH = "_locales/en/messages.json"

    # Remove files that are to be generated if they exist
    removeFileIfExists(FOUND_STRINGS_PATH)
    removeFileIfExists(NOT_FOUND_STRINGS_PATH)
    removeFileIfExists(POTENTIAL_MATCHES_PATH)
    removeFileIfExists(NOT_IN_MASTER_LIST_PATH)

    def flush():
        sys.stdout.flush() if not DEBUG else None
    # File search setup
    console.info("Scanning for localized strings...")
    files_to_ignore = ["./ts/localization/locales.ts"]
    ignore_patterns = [re.compile(re.escape(pattern)) for pattern in files_to_ignore]
    console.debug(f"Ignoring files: {', '.join(files_to_ignore)}")

    def should_ignore_file(path):
        return any(pattern.search(path) for pattern in ignore_patterns)

    def find_files_with_extension(root_dir, extensions):
        for entry in os.scandir(root_dir):
            if entry.is_dir():
                yield from find_files_with_extension(entry.path, extensions)
            elif any(entry.name.endswith(ext) for ext in extensions) and not should_ignore_file(entry.path):
                yield entry.path

    os_walk_time_start = time.perf_counter()
    files = set(find_files_with_extension("./ts/", (".ts", ".tsx")))
    files.update(
        [
            y
            for x in os.listdir("./")
            for y in glob.glob(os.path.join(x[0], "*preload.js"))
            if not should_ignore_file(y)
        ]
    )
    os_walk_time_end = time.perf_counter()
    bar_length = 50
    PROGRESS_BAR_CURRENT_PERCENTAGE = 0

    def progress_bar(current, total):
        global PROGRESS_BAR_CURRENT_PERCENTAGE
        if DEBUG:
            return
        percent_overall = round(100 * current / total)
        if percent_overall <= PROGRESS_BAR_CURRENT_PERCENTAGE:
            return
        PROGRESS_BAR_CURRENT_PERCENTAGE = percent_overall
        sys.stdout.write("\r")
        sys.stdout.write(
            "Progress: [{:{}}] {:>3}% ".format(
                "=" * int(percent_overall / (100 / bar_length)),
                bar_length,
                int(percent_overall),
            )
        )
        sys.stdout.flush()
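
    # Editor's illustration (not part of the commit): PROGRESS_BAR_CURRENT_PERCENTAGE
    # throttles redraws to one per whole percent, so a loop such as
    #
    #     for i in range(10_000):
    #         progress_bar(i, 10_000)
    #
    # writes at most 100 updates to stdout instead of 10,000.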
    # Read json file and get all keys
    parse_locale_file_time_start = time.perf_counter()
    with open(EN_PATH, 'r', encoding='utf-8') as messages_file:
        key_list = list(json.load(messages_file).keys())  # list() so key_list[i] below works
    number_of_keys = len(key_list)
    console.info(f"Loaded {number_of_keys} keys to search for")
    parse_locale_file_time_end = time.perf_counter()
    def search_string_in_regex_list(regex_list, file_content):
        return any(matcher.search(file_content) for matcher in regex_list)

    def load_file(file_path):
        console.debug(f"Loading {file_path} into memory")
        return open(file_path, "r", encoding="utf-8").read()

    read_files_time_start = time.perf_counter()
    loaded_files = [load_file(file_path) for file_path in files]
    read_files_time_end = time.perf_counter()

    def find_key(key):
        regex_list = localization_regex_as_list(key)
        return key if any(search_string_in_regex_list(regex_list, file_content) for file_content in loaded_files) else None

    def process_keys_concurrently():
        with multiprocessing.Pool() as pool:
            result = pool.map(find_key, key_list)
        return set(result)
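
    # Editor's illustration (not part of the commit): localization_regex_as_list()
    # comes from localization/regex.py, which this diff does not show; for a key
    # such as "continue" it plausibly yields compiled patterns for the known call
    # sites, e.g. something like [re.compile(r"window\.i18n\('continue'\)"), ...].
    # find_key() returns the key as soon as any loaded file matches any pattern,
    # and None otherwise, so pool.map() produces a list of keys and Nones.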
    REGEX_TIME_TRACKER = 0.0

    def regex_find(regex_list, file_content):
        global REGEX_TIME_TRACKER  # Declare the variable as global
        regex_start = time.perf_counter()
        found = search_string_in_regex_list(regex_list, file_content)
        regex_end = time.perf_counter()
        REGEX_TIME_TRACKER += regex_end - regex_start
        return found

    def print_search(search_key, search_info=""):
        console.debug(f"{search_key:<{42}} | {search_info}")
    def process_keys():
        found_strings_and_locations = {}  # Dictionary to store found strings and their locations
        found_strings_set = set()  # Set to store found strings
        not_found_strings_set = set()  # Set to store not found strings
        for_loop_iterations = {}
        if DEBUG:
            for_loop_iterations["keys"] = 0
            for_loop_iterations["files"] = 0
            for_loop_iterations["lines"] = 0
        for i in range(number_of_keys):
            key = key_list[i]
            regex_list = localization_regex_as_list(key)
            progress_bar(i, number_of_keys)
            if DEBUG:
                for_loop_iterations["keys"] += 1
            print_search(key, "Searching")
            locations = []
            j = -1
            for file_path in files:
                j += 1
                if DEBUG:
                    for_loop_iterations["files"] += 1
                if not regex_find(regex_list, loaded_files[j]):
                    continue
                found_strings_set.add(key)
                print_search(key, f"Found string in {file_path}")
                if args.identify_line_numbers:
                    for line_number, line in enumerate(loaded_files[j].split("\n"), start=1):
                        if DEBUG:
                            for_loop_iterations["lines"] += 1
                        if regex_find(regex_list, line):
                            locations.append(f"./{file_path}:{line_number}")
            if key not in found_strings_set:
                not_found_strings_set.add(key)
                print_search(key, "Not Found")
            if locations:
                print_search(key, f"Found in {len(locations)} files")
                found_strings_and_locations[key] = locations
        if DEBUG:
            console.debug(for_loop_iterations)
        return found_strings_set, not_found_strings_set, found_strings_and_locations
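
    # Editor's illustration (not part of the commit): process_keys() is the serial
    # path and the only one that records per-line locations; the concurrent path
    # fans the same find_key() out over a worker pool. For a large key list, a
    # chunksize hint to pool.map() can reduce inter-process overhead, e.g.:
    #
    #     result = pool.map(find_key, key_list, chunksize=64)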
    found_strings_and_locations = None
    processing_time_start = time.perf_counter()
    if CONCURRENCY_ENABLED:
        results_set = process_keys_concurrently()
        found_keys = set(key_list).intersection(results_set)
        not_found_keys = set(key_list).difference(results_set)
    else:
        found_keys, not_found_keys, found_strings_and_locations = process_keys()
    processing_time_end = time.perf_counter()
    progress_bar(1, 1)
    flush()

    # Writing found strings and their locations to a CSV file
    if args.write_found_to_file and found_strings_and_locations is not None:
        makeDirIfNotExists(FOUND_STRINGS_PATH)
        with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(["String", "Locations"])  # Header row
            for foundString, locations in found_strings_and_locations.items():
                # Write each found string and its locations. Locations are joined into a single string for CSV simplicity
                csvwriter.writerow([foundString, "; ".join(locations)])
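
    # Editor's illustration (not part of the commit): with hypothetical data, the
    # CSV written above would look like
    #
    #     String,Locations
    #     continue,./ts/components/Foo.tsx:12; ./ts/components/Bar.tsx:40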
    # Writing not found strings to a text file as before
    if args.write_not_found_to_file:
        makeDirIfNotExists(NOT_FOUND_STRINGS_PATH)
        with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file:
            for notFound in not_found_keys:
                not_found_file.write(f"{notFound}\n")

    num_found = len(found_keys)
    num_not_found = len(not_found_keys)

    sys.stdout.write("\n")
    # Print the result statistics and file paths (linkable)
    if args.print_not_found:
        [print(key) for key in sorted(not_found_keys)]
    def find_key_lazy(key):
        i = -1
        regex = re.compile(fr"['\"]{re.escape(key)}['\"]")
        for file_path in files:
            i += 1
            if regex.search(loaded_files[i]):
                return key, file_path
        return None, None

    def find_lazy_matches_for_not_found():
        with multiprocessing.Pool() as pool:
            result = pool.map(find_key_lazy, not_found_keys)
        return set(result)

    potential_matches = set()
    if args.find_potential_matches:
        potential_matches = find_lazy_matches_for_not_found()
        potential_matches.discard((None, None))
        [console.info(f"{key:<{42}} | Potential Match: {file_name}") for key, file_name in potential_matches]
        console.info(f"Found {len(potential_matches)} potential matches")
    console.info(
        f"Found {num_found}/{number_of_keys} ({(num_found / number_of_keys):.0%}) strings in {len(files)} files")
    if args.find_potential_matches and len(potential_matches) > 0:
        console.info(
            f"(Including all potential matches) Found {num_found + len(potential_matches)}/{number_of_keys} ({((num_found + len(potential_matches)) / number_of_keys):.0%}) strings in {len(files)} files")

    if args.write_found_to_file:
        console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}")
    if args.write_not_found_to_file:
        console.info(
            f"Identified {num_not_found} not found strings and written to: {NOT_FOUND_STRINGS_PATH}"
        )
    else:
        console.info(f"Identified {num_not_found} not found strings")

    if DEBUG and REGEX_TIME_TRACKER > 0:
        console.debug(f"Time spent in regex land: {REGEX_TIME_TRACKER:0.4f} seconds")
    if DEBUG:
        os_walk_time = os_walk_time_end - os_walk_time_start
        parse_locale_time = parse_locale_file_time_end - parse_locale_file_time_start
        read_files_time = read_files_time_end - read_files_time_start
        processing_time = processing_time_end - processing_time_start
        console.debug(f"OS Walk reading time: {os_walk_time:0.4f} seconds")
        console.debug(f"Locale File parse time: {parse_locale_time:0.4f} seconds")
        console.debug(f"File reading time: {read_files_time:0.4f} seconds")
        console.debug(f"Processing time: {processing_time:0.4f} seconds")
        console.debug(
            f"Total Elapsed Tracked Time: {os_walk_time + parse_locale_time + read_files_time + processing_time:0.4f} seconds")

    timer.stop()
    def remove_keys_from_json(json_file_path, keys_to_remove):
        # Load the JSON data from the file
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        # Remove the specified keys from the JSON data
        data = {key: value for key, value in data.items() if key not in keys_to_remove}
        # Write the updated data back to the original JSON file
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)
        print(f"Keys removed and JSON file updated: {json_file_path}")
    if args.delete_unused_keys:
        # Pass a tuple so endswith() checks the whole file name (a bare string would
        # be iterated character by character), and match the actual locale file name,
        # "messages.json".
        locale_files = find_files_with_extension("./_locales", ("messages.json",))
        for locale_file in locale_files:
            remove_keys_from_json(locale_file, not_found_keys)
