From 53adc815b12da699becce25ccc9b0b4db37227e4 Mon Sep 17 00:00:00 2001
From: Audric Ackermann
Date: Thu, 29 Aug 2024 17:33:29 +1000
Subject: [PATCH] Revert "fix: analyse locale CI failing with worker pool"

This reverts commit 919abc03b6b32ec4a85b08d714dc0433485444e3.
---
 .../generateLocalizedStringsAnalysis.py       | 622 +++++++++---------
 1 file changed, 309 insertions(+), 313 deletions(-)

diff --git a/tools/localization/generateLocalizedStringsAnalysis.py b/tools/localization/generateLocalizedStringsAnalysis.py
index b00ed2847..5e006eb5d 100755
--- a/tools/localization/generateLocalizedStringsAnalysis.py
+++ b/tools/localization/generateLocalizedStringsAnalysis.py
@@ -21,373 +21,369 @@ from localization.regex import localization_regex_as_list
 from util.fileUtils import makeDirIfNotExists, removeFileIfExists
 from util.logger import console
 
-if __name__ == '__main__':
-
-
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--debug", action="store_true", help="Enable debug mode, print debug messages"
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=str,
-        default="./tools/localization/analysis",
-        help="Output directory for the results",
-    )
-    parser.add_argument(
-        "--write-found-to-file",
-        action="store_true",
-        help="Write the found strings to a file",
-    )
-    parser.add_argument(
-        "--write-not-found-to-file",
-        action="store_true",
-        help="Write the not found strings to a file",
-    )
-    parser.add_argument(
-        "--print-not-found",
-        action="store_true",
-        help="Print the not found strings",
-    )
-    parser.add_argument(
-        "--identify-found-in-files",
-        action="store_true",
-        help="Identify the files each string is found in using regex.",
-    )
-    parser.add_argument(
-        "--identify-line-numbers",
-        action="store_true",
-        help="Identify line numbers using regex.",
-    )
-    parser.add_argument(
-        "--disable-concurrency",
-        action="store_true",
-        help="Disable multiprocessing concurrency.",
-    )
-    parser.add_argument(
-        "--find-potential-matches",
-        action="store_true",
-        help="Find potentially matched strings using a very lazy regex.",
-    )
-    parser.add_argument(
-        "--delete-unused-keys",
-        action="store_true",
-        help="Delete unused keys."
-    )
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--debug", action="store_true", help="Enable debug mode, print debug messages"
+)
+parser.add_argument(
+    "--output-dir",
+    type=str,
+    default="./tools/localization/analysis",
+    help="Output directory for the results",
+)
+parser.add_argument(
+    "--write-found-to-file",
+    action="store_true",
+    help="Write the found strings to a file",
+)
+parser.add_argument(
+    "--write-not-found-to-file",
+    action="store_true",
+    help="Write the not found strings to a file",
+)
+parser.add_argument(
+    "--print-not-found",
+    action="store_true",
+    help="Print the not found strings",
+)
+parser.add_argument(
+    "--identify-found-in-files",
+    action="store_true",
+    help="Identify the files each string is found in using regex.",
+)
+parser.add_argument(
+    "--identify-line-numbers",
+    action="store_true",
+    help="Identify line numbers using regex.",
+)
+parser.add_argument(
+    "--disable-concurrency",
+    action="store_true",
+    help="Disable multiprocessing concurrency.",
+)
+parser.add_argument(
+    "--find-potential-matches",
+    action="store_true",
+    help="Find potentially matched strings using a very lazy regex.",
+)
+parser.add_argument(
+    "--delete-unused-keys",
+    action="store_true",
+    help="Delete unused keys." 
+) + +args = parser.parse_args() + +# Configuration +DEBUG = args.debug +CONCURRENCY_ENABLED = not args.disable_concurrency + +if CONCURRENCY_ENABLED and (args.identify_found_in_files or args.identify_line_numbers): + CONCURRENCY_ENABLED = False + console.info(f"Concurrency is disabled when --identify-found-in-files or --identify-line-numbers is used") + +if CONCURRENCY_ENABLED: + console.info(f"Concurrency enabled. Use --disable-concurrency to disable concurrency.") + +console.enableDebug() if DEBUG else None + +OUTPUT_DIR = args.output_dir +FOUND_STRINGS_PATH = os.path.join(OUTPUT_DIR, "found_strings.csv") +NOT_FOUND_STRINGS_PATH = os.path.join(OUTPUT_DIR, "not_found_strings.txt") +POTENTIAL_MATCHES_PATH = os.path.join(OUTPUT_DIR, "potential_matches.csv") +NOT_IN_MASTER_LIST_PATH = os.path.join(OUTPUT_DIR, "not_in_master_list.csv") + +EN_PATH = "_locales/en/messages.json" + +# Remove files that are to be generated if they exist +removeFileIfExists(FOUND_STRINGS_PATH) +removeFileIfExists(NOT_FOUND_STRINGS_PATH) +removeFileIfExists(POTENTIAL_MATCHES_PATH) +removeFileIfExists(NOT_IN_MASTER_LIST_PATH) + + +def flush(): + sys.stdout.flush() if not DEBUG else None + + +# File search setup +console.info("Scanning for localized strings...") +files_to_ignore = ["./ts/localization/locales.ts"] +ignore_patterns = [re.compile(re.escape(pattern)) for pattern in files_to_ignore] + +console.debug(f"Ignoring files: {', '.join(files_to_ignore)}") + + +def should_ignore_file(path): + return any(pattern.search(path) for pattern in ignore_patterns) + + +def find_files_with_extension(root_dir, extensions): + for entry in os.scandir(root_dir): + if entry.is_dir(): + yield from find_files_with_extension(entry.path, extensions) + elif any(entry.name.endswith(ext) for ext in extensions) and not should_ignore_file(entry.path): + yield entry.path + + +os_walk_time_start = time.perf_counter() +files = set(find_files_with_extension("./ts/", (".ts", ".tsx"))) +files.update( + [ + y + for x in os.listdir("./") + for y in glob.glob(os.path.join(x[0], "*preload.js")) + if not should_ignore_file(y) + ] +) +os_walk_time_end = time.perf_counter() + +bar_length = 50 + +PROGRESS_BAR_CURRENT_PERCENTAGE = 0 + + +def progress_bar(current, total): + global PROGRESS_BAR_CURRENT_PERCENTAGE + if DEBUG: + return + percent_overall = round(100 * current / total) + if percent_overall <= PROGRESS_BAR_CURRENT_PERCENTAGE: + return + PROGRESS_BAR_CURRENT_PERCENTAGE = percent_overall + sys.stdout.write("\r") + sys.stdout.write( + "Progress: [{:{}}] {:>3}% ".format( + "=" * int(percent_overall / (100 / bar_length)), + bar_length, + int(percent_overall), + ) ) + sys.stdout.flush() - args = parser.parse_args() - - # Configuration - DEBUG = args.debug - CONCURRENCY_ENABLED = not args.disable_concurrency - if CONCURRENCY_ENABLED and (args.identify_found_in_files or args.identify_line_numbers): - CONCURRENCY_ENABLED = False - console.info(f"Concurrency is disabled when --identify-found-in-files or --identify-line-numbers is used") +# Read json file and get all keys +parse_locale_file_time_start = time.perf_counter() +with open(EN_PATH, 'r', encoding='utf-8') as messages_file: + key_list = json.load(messages_file).keys() +number_of_keys = len(key_list) +console.info(f"Loaded {number_of_keys} keys to search for") +parse_locale_file_time_end = time.perf_counter() - if CONCURRENCY_ENABLED: - console.info(f"Concurrency enabled. 
Use --disable-concurrency to disable concurrency.") - console.enableDebug() if DEBUG else None +def search_string_in_regex_list(regex_list, file_content): + return any(matcher.search(file_content) for matcher in regex_list) - OUTPUT_DIR = args.output_dir - FOUND_STRINGS_PATH = os.path.join(OUTPUT_DIR, "found_strings.csv") - NOT_FOUND_STRINGS_PATH = os.path.join(OUTPUT_DIR, "not_found_strings.txt") - POTENTIAL_MATCHES_PATH = os.path.join(OUTPUT_DIR, "potential_matches.csv") - NOT_IN_MASTER_LIST_PATH = os.path.join(OUTPUT_DIR, "not_in_master_list.csv") - EN_PATH = "_locales/en/messages.json" +def load_file(file_path): + console.debug(f"Loading {file_path} into memory") + return open(file_path, "r", encoding="utf-8").read() - # Remove files that are to be generated if they exist - removeFileIfExists(FOUND_STRINGS_PATH) - removeFileIfExists(NOT_FOUND_STRINGS_PATH) - removeFileIfExists(POTENTIAL_MATCHES_PATH) - removeFileIfExists(NOT_IN_MASTER_LIST_PATH) +read_files_time_start = time.perf_counter() +loaded_files = [load_file(file_path) for file_path in files] +read_files_time_end = time.perf_counter() - def flush(): - sys.stdout.flush() if not DEBUG else None +def find_key(key): + regex_list = localization_regex_as_list(key) + return key if any(search_string_in_regex_list(regex_list, file_content) for file_content in loaded_files) else None - # File search setup - console.info("Scanning for localized strings...") - files_to_ignore = ["./ts/localization/locales.ts"] - ignore_patterns = [re.compile(re.escape(pattern)) for pattern in files_to_ignore] - console.debug(f"Ignoring files: {', '.join(files_to_ignore)}") +def process_keys_concurrently(): + with multiprocessing.Pool() as pool: + result = pool.map(find_key, key_list) + return set(result) - def should_ignore_file(path): - return any(pattern.search(path) for pattern in ignore_patterns) +REGEX_TIME_TRACKER = 0.0 - def find_files_with_extension(root_dir, extensions): - for entry in os.scandir(root_dir): - if entry.is_dir(): - yield from find_files_with_extension(entry.path, extensions) - elif any(entry.name.endswith(ext) for ext in extensions) and not should_ignore_file(entry.path): - yield entry.path +def regex_find(regex_list, file_content): + global REGEX_TIME_TRACKER # Declare the variable as global + regex_start = time.perf_counter() + found = search_string_in_regex_list(regex_list, file_content) + regex_end = time.perf_counter() + REGEX_TIME_TRACKER += (regex_end - regex_start) # Correct time calculation + return found - os_walk_time_start = time.perf_counter() - files = set(find_files_with_extension("./ts/", (".ts", ".tsx"))) - files.update( - [ - y - for x in os.listdir("./") - for y in glob.glob(os.path.join(x[0], "*preload.js")) - if not should_ignore_file(y) - ] - ) - os_walk_time_end = time.perf_counter() - - bar_length = 50 +def print_search(search_key, search_info=""): + console.debug(f"{search_key:<{42}} | {search_info}") - PROGRESS_BAR_CURRENT_PERCENTAGE = 0 +def process_keys(): + found_strings_and_locations = {} # Dictionary to store found strings and their locations + found_strings_set = set() # Set to store found strings + not_found_strings_set = set() # Set to store not found strings + for_loop_iterations = {} + if DEBUG: + for_loop_iterations["keys"] = 0 + for_loop_iterations["files"] = 0 + for_loop_iterations["lines"] = 0 + for i in range(number_of_keys): + key = key_list[i] + regex_list = localization_regex_as_list(key) - def progress_bar(current, total): - global PROGRESS_BAR_CURRENT_PERCENTAGE - if DEBUG: - return - 
percent_overall = round(100 * current / total) - if percent_overall <= PROGRESS_BAR_CURRENT_PERCENTAGE: - return - PROGRESS_BAR_CURRENT_PERCENTAGE = percent_overall - sys.stdout.write("\r") - sys.stdout.write( - "Progress: [{:{}}] {:>3}% ".format( - "=" * int(percent_overall / (100 / bar_length)), - bar_length, - int(percent_overall), - ) + progress_bar( + i, number_of_keys ) - sys.stdout.flush() - - - # Read json file and get all keys - parse_locale_file_time_start = time.perf_counter() - with open(EN_PATH, 'r', encoding='utf-8') as messages_file: - key_list = json.load(messages_file).keys() - number_of_keys = len(key_list) - console.info(f"Loaded {number_of_keys} keys to search for") - parse_locale_file_time_end = time.perf_counter() + if DEBUG: + for_loop_iterations["keys"] += 1 - def search_string_in_regex_list(regex_list, file_content): - return any(matcher.search(file_content) for matcher in regex_list) - - - def load_file(file_path): - console.debug(f"Loading {file_path} into memory") - return open(file_path, "r", encoding="utf-8").read() - - - read_files_time_start = time.perf_counter() - loaded_files = [load_file(file_path) for file_path in files] - read_files_time_end = time.perf_counter() - + print_search(key, f"Searching") - def find_key(key): - regex_list = localization_regex_as_list(key) - return key if any(search_string_in_regex_list(regex_list, file_content) for file_content in loaded_files) else None + locations = [] + j = -1 + for file_path in files: + j += 1 + if DEBUG: + for_loop_iterations["files"] += 1 - def process_keys_concurrently(): - with multiprocessing.Pool() as pool: - result = pool.map(find_key, key_list) - return set(result) + if not regex_find(regex_list, loaded_files[j]): + continue + found_strings_set.add(key) - REGEX_TIME_TRACKER = 0.0 + print_search(key, f"Found string in {file_path}") + if args.identify_line_numbers: + for line_number, line in enumerate(loaded_files[j].split("\n"), start=1): + if DEBUG: + for_loop_iterations["lines"] += 1 - def regex_find(regex_list, file_content): - global REGEX_TIME_TRACKER # Declare the variable as global - regex_start = time.perf_counter() - found = search_string_in_regex_list(regex_list, file_content) - regex_end = time.perf_counter() - REGEX_TIME_TRACKER += (regex_end - regex_start) # Correct time calculation - return found + if regex_find(regex_list, line): + locations.append(f"./{file_path}:{line_number}") + if key not in found_strings_set: + not_found_strings_set.add(key) + print_search(key, f"Not Found") + if locations: + print_search(key, f"Found in {len(locations)} files") + found_strings_and_locations[key] = locations - def print_search(search_key, search_info=""): - console.debug(f"{search_key:<{42}} | {search_info}") + if DEBUG: + console.debug(for_loop_iterations) + return found_strings_set, not_found_strings_set, found_strings_and_locations + + +found_strings_and_locations = None +processing_time_start = time.perf_counter() +if CONCURRENCY_ENABLED: + results_set = process_keys_concurrently() + found_keys = set(key_list).intersection(results_set) + not_found_keys = set(key_list).difference(results_set) +else: + found_keys, not_found_keys, found_strings_and_locations = process_keys() +processing_time_end = time.perf_counter() + +progress_bar(1, 1) +flush() + +# Writing found strings and their locations to a CSV file +if args.write_found_to_file and found_strings_and_locations is not None: + makeDirIfNotExists(FOUND_STRINGS_PATH) + with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as 
csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(["String", "Locations"]) # Header row + for foundString, locations in found_strings_and_locations.items(): + # Write each found string and its locations. Locations are joined into a single string for CSV simplicity + csvwriter.writerow( + [foundString, "; ".join(locations)] + ) +# Writing not found strings to a text file as before +if args.write_not_found_to_file: + makeDirIfNotExists(NOT_FOUND_STRINGS_PATH) + with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file: + for notFound in not_found_keys: + not_found_file.write(f"{notFound}\n") - def process_keys(): - found_strings_and_locations = {} # Dictionary to store found strings and their locations - found_strings_set = set() # Set to store found strings - not_found_strings_set = set() # Set to store not found strings - for_loop_iterations = {} - if DEBUG: - for_loop_iterations["keys"] = 0 - for_loop_iterations["files"] = 0 - for_loop_iterations["lines"] = 0 - for i in range(number_of_keys): - key = key_list[i] - regex_list = localization_regex_as_list(key) - - progress_bar( - i, number_of_keys - ) +num_found = len(found_keys) +num_not_found = len(not_found_keys) - if DEBUG: - for_loop_iterations["keys"] += 1 +sys.stdout.write("\n") +# Print the result statistics and file paths (linkable) - print_search(key, f"Searching") +if args.print_not_found: + [print(key) for key in sorted(not_found_keys)] - locations = [] - j = -1 - for file_path in files: - j += 1 - if DEBUG: - for_loop_iterations["files"] += 1 +def find_key_lazy(key): + i = -1 + regex = re.compile(fr"['\"]{re.escape(key)}['\"]") + for file_path in files: + i += 1 + if regex.search(loaded_files[i]): + return key, file_path + return None, None - if not regex_find(regex_list, loaded_files[j]): - continue - found_strings_set.add(key) +def find_lazy_matches_for_not_found(): + with multiprocessing.Pool() as pool: + result = pool.map(find_key_lazy, not_found_keys) + return set(result) - print_search(key, f"Found string in {file_path}") - if args.identify_line_numbers: - for line_number, line in enumerate(loaded_files[j].split("\n"), start=1): - if DEBUG: - for_loop_iterations["lines"] += 1 +potential_matches = set() +if args.find_potential_matches: + potential_matches = find_lazy_matches_for_not_found() + potential_matches.discard((None, None)) + [console.info(f"{key:<{42}} | Potential Match: {file_name}") for key, file_name in potential_matches] + console.info(f"Found {len(potential_matches)} potential matches") - if regex_find(regex_list, line): - locations.append(f"./{file_path}:{line_number}") +console.info( + f"Found {num_found}/{number_of_keys} ({(num_found / number_of_keys):.0%}) strings in {len(files)} files") - if key not in found_strings_set: - not_found_strings_set.add(key) - print_search(key, f"Not Found") - if locations: - print_search(key, f"Found in {len(locations)} files") - found_strings_and_locations[key] = locations +if args.find_potential_matches and len(potential_matches) > 0: + console.info( + f"(Including all potential matches) Found {num_found + len(potential_matches)}/{number_of_keys} ({((num_found + len(potential_matches)) / number_of_keys):.0%}) strings in {len(files)} files") - if DEBUG: - console.debug(for_loop_iterations) - return found_strings_set, not_found_strings_set, found_strings_and_locations - - - found_strings_and_locations = None - processing_time_start = time.perf_counter() - if CONCURRENCY_ENABLED: - results_set = process_keys_concurrently() - found_keys = 
set(key_list).intersection(results_set) - not_found_keys = set(key_list).difference(results_set) - else: - found_keys, not_found_keys, found_strings_and_locations = process_keys() - processing_time_end = time.perf_counter() - - progress_bar(1, 1) - flush() - - # Writing found strings and their locations to a CSV file - if args.write_found_to_file and found_strings_and_locations is not None: - makeDirIfNotExists(FOUND_STRINGS_PATH) - with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as csvfile: - csvwriter = csv.writer(csvfile) - csvwriter.writerow(["String", "Locations"]) # Header row - for foundString, locations in found_strings_and_locations.items(): - # Write each found string and its locations. Locations are joined into a single string for CSV simplicity - csvwriter.writerow( - [foundString, "; ".join(locations)] - ) - - # Writing not found strings to a text file as before - if args.write_not_found_to_file: - makeDirIfNotExists(NOT_FOUND_STRINGS_PATH) - with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file: - for notFound in not_found_keys: - not_found_file.write(f"{notFound}\n") - - num_found = len(found_keys) - num_not_found = len(not_found_keys) - - sys.stdout.write("\n") - # Print the result statistics and file paths (linkable) - - if args.print_not_found: - [print(key) for key in sorted(not_found_keys)] - - - def find_key_lazy(key): - i = -1 - regex = re.compile(fr"['\"]{re.escape(key)}['\"]") - for file_path in files: - i += 1 - if regex.search(loaded_files[i]): - return key, file_path - return None, None +if args.write_found_to_file: + console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}") +if args.write_not_found_to_file: + console.info( + f"Identified {num_not_found} not found strings and written to: {NOT_FOUND_STRINGS_PATH}" + ) +else: + console.info(f"Identified {num_not_found} not found strings") - def find_lazy_matches_for_not_found(): - with multiprocessing.Pool() as pool: - result = pool.map(find_key_lazy, not_found_keys) - return set(result) +if DEBUG and REGEX_TIME_TRACKER > 0: + console.debug(f"Time spend in regex land: {REGEX_TIME_TRACKER:0.4f} seconds") +if DEBUG: + os_walk_time = os_walk_time_end - os_walk_time_start + parse_locale_time = parse_locale_file_time_end - parse_locale_file_time_start + read_files_time = read_files_time_end - read_files_time_start + processing_time = processing_time_end - processing_time_start + console.debug(f"OS Walk reading time: {os_walk_time:0.4f} seconds") + console.debug(f"Locale File parse time: {parse_locale_time:0.4f} seconds") + console.debug(f"File reading time: {read_files_time:0.4f} seconds") + console.debug(f"Processing time: {processing_time:0.4f} seconds") + console.debug( + f"Total Elapsed Tracked Time: {os_walk_time + parse_locale_time + read_files_time + processing_time:0.4f} seconds") - potential_matches = set() - if args.find_potential_matches: - potential_matches = find_lazy_matches_for_not_found() - potential_matches.discard((None, None)) - [console.info(f"{key:<{42}} | Potential Match: {file_name}") for key, file_name in potential_matches] - console.info(f"Found {len(potential_matches)} potential matches") +timer.stop() - console.info( - f"Found {num_found}/{number_of_keys} ({(num_found / number_of_keys):.0%}) strings in {len(files)} files") - if args.find_potential_matches and len(potential_matches) > 0: - console.info( - f"(Including all potential matches) Found {num_found + len(potential_matches)}/{number_of_keys} ({((num_found + 
len(potential_matches)) / number_of_keys):.0%}) strings in {len(files)} files")
-
-    if args.write_found_to_file:
-        console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}")
-
-    if args.write_not_found_to_file:
-        console.info(
-            f"Identified {num_not_found} not found strings and written to: {NOT_FOUND_STRINGS_PATH}"
-        )
-    else:
-        console.info(f"Identified {num_not_found} not found strings")
-
-    if DEBUG and REGEX_TIME_TRACKER > 0:
-        console.debug(f"Time spent in regex land: {REGEX_TIME_TRACKER:0.4f} seconds")
-    if DEBUG:
-        os_walk_time = os_walk_time_end - os_walk_time_start
-        parse_locale_time = parse_locale_file_time_end - parse_locale_file_time_start
-        read_files_time = read_files_time_end - read_files_time_start
-        processing_time = processing_time_end - processing_time_start
-        console.debug(f"OS Walk reading time: {os_walk_time:0.4f} seconds")
-        console.debug(f"Locale File parse time: {parse_locale_time:0.4f} seconds")
-        console.debug(f"File reading time: {read_files_time:0.4f} seconds")
-        console.debug(f"Processing time: {processing_time:0.4f} seconds")
-        console.debug(
-            f"Total Elapsed Tracked Time: {os_walk_time + parse_locale_time + read_files_time + processing_time:0.4f} seconds")
-
-    timer.stop()
-
-
-    def remove_keys_from_json(json_file_path, keys_to_remove):
-        # Load the JSON data from the file
-        with open(json_file_path, 'r', encoding='utf-8') as json_file:
-            data = json.load(json_file)
-
-        # Remove the specified keys from the JSON data
-        data = {key: value for key, value in data.items() if key not in keys_to_remove}
-
-        # Write the updated data back to the original JSON file
-        with open(json_file_path, 'w', encoding='utf-8') as json_file:
-            json.dump(data, json_file, ensure_ascii=False, indent=4)
-        print(f"Keys removed and JSON file updated: {json_file_path}")
-
-
-    if args.delete_unused_keys:
-        locale_files = find_files_with_extension("./_locales", ("messages.json",))
-        for locale_file in locale_files:
-            remove_keys_from_json(locale_file, not_found_keys)
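
Note: the subject of the reverted commit (919abc03) suggests it wrapped this
script in an `if __name__ == '__main__':` guard to stop the multiprocessing
worker pool from failing in CI. That guard matters whenever Python uses the
"spawn" start method (the default on Windows and, since Python 3.8, macOS):
each Pool worker re-imports the main module, so any module-level code such as
argument parsing, file scanning, and the Pool creation itself runs again in
every worker, and the program typically aborts with a "bootstrapping phase"
RuntimeError. Under the "fork" start method (the Linux default), workers
inherit the parent's memory instead of re-importing, so the module-level
layout this revert restores still works there. Below is a minimal sketch of
the guarded pattern; check_key is a hypothetical stand-in for this script's
find_key worker and is not part of the patch above.

    import multiprocessing


    def check_key(key):
        # Worker function: defined at module level so spawned workers can
        # import and unpickle a reference to it.
        return key.upper()


    if __name__ == '__main__':
        # Pool creation stays inside the guard, so a worker re-importing
        # this module does not recursively start new pools.
        with multiprocessing.Pool() as pool:
            print(pool.map(check_key, ["a", "b", "c"]))

With the guard in place the sketch behaves the same under both fork and
spawn; without it, spawn-based platforms re-execute the module body in every
worker, which is the worker-pool failure mode the reverted commit addressed.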