From 8b16d15762a2016302191beb3a2346392083d7a9 Mon Sep 17 00:00:00 2001 From: Ryan Miller Date: Wed, 28 Aug 2024 14:29:57 +1000 Subject: [PATCH] chore: create find unused strings scripts --- .../generateLocalizedStringsAnalysis.py | 505 ++++++++++-------- tools/localization/regex.py | 42 +- 2 files changed, 313 insertions(+), 234 deletions(-) diff --git a/tools/localization/generateLocalizedStringsAnalysis.py b/tools/localization/generateLocalizedStringsAnalysis.py index d7fa0d009..197f3b735 100755 --- a/tools/localization/generateLocalizedStringsAnalysis.py +++ b/tools/localization/generateLocalizedStringsAnalysis.py @@ -5,50 +5,85 @@ import csv import re import glob import argparse +import multiprocessing import json +from functools import partial # This allows for importing from the localization and util directories NOTE: Auto importing tools will also prepend the import paths with "tools." this will not work and needs to be removed from import paths sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) from util.time import ExecutionTimer +import time timer = ExecutionTimer() -from localization.parseDictionary import parse_dictionary -from localization.regex import localization_regex -from util.listUtils import missingFromSet, removeFromSet +from localization.regex import localization_regex_as_list from util.fileUtils import makeDirIfNotExists, removeFileIfExists from util.logger import console - parser = argparse.ArgumentParser() parser.add_argument( - "--debug", action="store_true", help="Enable debug mode, print debug messages" + "--debug", action="store_true", help="Enable debug mode, print debug messages" +) +parser.add_argument( + "--output-dir", + type=str, + default="./tools/localization/analysis", + help="Output directory for the results", +) +parser.add_argument( + "--write-found-to-file", + action="store_true", + help="Write the found strings to a file", +) +parser.add_argument( + "--write-not-found-to-file", + action="store_true", + help="Write the not found strings to a file", +) +parser.add_argument( + "--print-not-found", + action="store_true", + help="Print the not found strings", ) parser.add_argument( - "--output-dir", - type=str, - default="./tools/localization/analysis", - help="Output directory for the results", + "--identify-found-in-files", + action="store_true", + help="Identify line-numbers using regex.", ) parser.add_argument( - "--master-strings", - type=str, - default="./tools/localization/input/master_string_list.txt", - help="Path to the master string list", + "--identify-line-numbers", + action="store_true", + help="Identify line-numbers using regex.", ) parser.add_argument( - "--to-be-removed", - type=str, - default="./tools/localization/input/to_be_removed_list.txt", - help="Path to the list of strings to be removed", + "--disable-concurrency", + action="store_true", + help="Disable multiprocessing concurrency.", +) +parser.add_argument( + "--find-potential-matches", + action="store_true", + help="Find potential matched strings using very lazy regex.", +) +parser.add_argument( + "--delete-unused-keys", + action="store_true", + help="Delete unused keys." ) args = parser.parse_args() # Configuration -intentionallyUnusedStrings = [] DEBUG = args.debug +CONCURRENCY_ENABLED = not args.disable_concurrency + +if CONCURRENCY_ENABLED and (args.identify_found_in_files or args.identify_line_numbers): + CONCURRENCY_ENABLED = False + console.info(f"Concurrency is disabled when --identify-found-in-files or --identify-line-numbers is used") + +if CONCURRENCY_ENABLED: + console.info(f"Concurrency enabled. Use --disable-concurrency to disable concurrency.") console.enableDebug() if DEBUG else None @@ -60,9 +95,6 @@ NOT_IN_MASTER_LIST_PATH = os.path.join(OUTPUT_DIR, "not_in_master_list.csv") EN_PATH = "_locales/en/messages.json" -MASTER_STRINGS_PATH = args.master_strings -TO_BE_REMOVED_PATH = args.to_be_removed - # Remove files that are to be generated if they exist removeFileIfExists(FOUND_STRINGS_PATH) removeFileIfExists(NOT_FOUND_STRINGS_PATH) @@ -71,252 +103,287 @@ removeFileIfExists(NOT_IN_MASTER_LIST_PATH) def flush(): - sys.stdout.flush() if not DEBUG else None + sys.stdout.flush() if not DEBUG else None # File search setup console.info("Scanning for localized strings...") -files = [] -files_to_ignore = ["LocalizerKeys.ts"] -ignore_patterns = [re.compile(pattern) for pattern in files_to_ignore] +files_to_ignore = ["./ts/localization/locales.ts"] +ignore_patterns = [re.compile(re.escape(pattern)) for pattern in files_to_ignore] console.debug(f"Ignoring files: {", ".join(files_to_ignore)}") -def should_ignore_file(file_path): - return any(pattern.search(file_path) for pattern in ignore_patterns) +def should_ignore_file(path): + return any(pattern.search(path) for pattern in ignore_patterns) + + +def find_files_with_extension(root_dir, extensions): + for entry in os.scandir(root_dir): + if entry.is_dir(): + yield from find_files_with_extension(entry.path, extensions) + elif any(entry.name.endswith(ext) for ext in extensions) and not should_ignore_file(entry.path): + yield entry.path -for extension in ("*.ts", "*.tsx"): - files.extend( - [ - y - for x in os.walk("./ts/") - for y in glob.glob(os.path.join(x[0], extension)) - if not should_ignore_file(y) - ] +os_walk_time_start = time.perf_counter() +files = set(find_files_with_extension("./ts/", (".ts", ".tsx"))) +files.update( + [ + y + for x in os.listdir("./") + for y in glob.glob(os.path.join(x[0], "*preload.js")) + if not should_ignore_file(y) + ] +) +os_walk_time_end = time.perf_counter() + +bar_length = 50 + +PROGRESS_BAR_CURRENT_PERCENTAGE = 0 + + +def progress_bar(current, total): + global PROGRESS_BAR_CURRENT_PERCENTAGE + if DEBUG: + return + percent_overall = round(100 * current / total) + if percent_overall <= PROGRESS_BAR_CURRENT_PERCENTAGE: + return + PROGRESS_BAR_CURRENT_PERCENTAGE = percent_overall + sys.stdout.write("\r") + sys.stdout.write( + "Progress: [{:{}}] {:>3}% ".format( + "=" * int(percent_overall / (100 / bar_length)), + bar_length, + int(percent_overall), ) + ) + sys.stdout.flush() -foundStringsAndLocations = {} # Dictionary to store found strings and their locations -notFoundStrings = set() # Set to store not found strings -total_files = len(files) * 1.1 -bar_length = 25 +# Read json file and get all keys +parse_locale_file_time_start = time.perf_counter() +with open(EN_PATH, 'r', encoding='utf-8') as messages_file: + key_list = json.load(messages_file).keys() +number_of_keys = len(key_list) +console.info(f"Loaded {number_of_keys} keys to search for") +parse_locale_file_time_end = time.perf_counter() -def progress_bar(current, total, overallCurrent, overalTotal): - if DEBUG: - return - percent = 100.0 * current / total - percentOverall = 100.0 * overallCurrent / overalTotal - sys.stdout.write("\r") - sys.stdout.write( - "Overall: [{:{}}] {:>3}% ".format( - "=" * int(percentOverall / (100.0 / bar_length)), - bar_length, - int(percentOverall), - ) - ) - sys.stdout.write( - "Stage: [{:{}}] {:>3}%".format( - "=" * int(percent / (100.0 / bar_length)), bar_length, int(percent) - ) - ) - sys.stdout.flush() +def search_string_in_regex_list(regex_list, file_content): + return any(matcher.search(file_content) for matcher in regex_list) -current_line_number = 0 -current_file_number = 0 -line_count = 0 -keys = [] +def load_file(file_path): + console.debug(f"Loading {file_path} into memory") + return open(file_path, "r", encoding="utf-8").read() -with open(EN_PATH, "r", encoding="utf-8") as messages_file: - messages_dict = json.load(messages_file) -# Read json file and get all keys -with open(EN_PATH, "r", encoding="utf-8") as messages_file: - for line in messages_file: - for match in re.finditer(r'"([^"]+)":', line): - keys.append(match.group(1)) +read_files_time_start = time.perf_counter() +loaded_files = [load_file(file_path) for file_path in files] +read_files_time_end = time.perf_counter() -total_line_numbers = len(keys) -console.debug(f"Total keys: {total_line_numbers}") +def find_key(key): + regex_list = localization_regex_as_list(key) + return key if any(search_string_in_regex_list(regex_list, file_content) for file_content in loaded_files) else None -def format_vscode_path(file_path): - return file_path.replace("./", "") +def process_keys_concurrently(): + with multiprocessing.Pool() as pool: + result = pool.map(find_key, key_list) + return set(result) -# search -for key in keys: - if key in intentionallyUnusedStrings: - continue - searchedLine = localization_regex(key) +REGEX_TIME_TRACKER = 0.0 + + +def regex_find(regex_list, file_content): + global REGEX_TIME_TRACKER # Declare the variable as global + regex_start = time.perf_counter() + found = search_string_in_regex_list(regex_list, file_content) + regex_end = time.perf_counter() + REGEX_TIME_TRACKER += (regex_end - regex_start) # Correct time calculation + return found + + +def print_search(search_key, search_info=""): + console.debug(f"{search_key:<{42}} | {search_info}") + + +def process_keys(): + found_strings_and_locations = {} # Dictionary to store found strings and their locations + found_strings_set = set() # Set to store found strings + not_found_strings_set = set() # Set to store not found strings + for_loop_iterations = {} + if DEBUG: + for_loop_iterations["keys"] = 0 + for_loop_iterations["files"] = 0 + for_loop_iterations["lines"] = 0 + for i in range(number_of_keys): + key = key_list[i] + regex_list = localization_regex_as_list(key) + + progress_bar( + i, number_of_keys + ) + + if DEBUG: + for_loop_iterations["keys"] += 1 + + print_search(key, f"Searching") locations = [] - current_file_number = 0 # To keep track of the current file number for progress bar + j = -1 for file_path in files: - with open(file_path, "r", encoding="utf-8") as file_content: - content = file_content.read() - for line_number, line in enumerate(content.split("\n"), start=1): - if searchedLine.search(line): - locations.append(f"{format_vscode_path(file_path)}:{line_number}") - - current_file_number += 1 - progress_bar( - current_file_number, total_files, current_line_number, total_line_numbers - ) - current_line_number += 1 + j += 1 + + if DEBUG: + for_loop_iterations["files"] += 1 + + if not regex_find(regex_list, loaded_files[j]): + continue + + found_strings_set.add(key) + + print_search(key, f"Found string in {file_path}") + + if args.identify_line_numbers: + for line_number, line in enumerate(loaded_files[j].split("\n"), start=1): + if DEBUG: + for_loop_iterations["lines"] += 1 + + if regex_find(regex_list, line): + locations.append(f"./{file_path}:{line_number}") + + if key not in found_strings_set: + not_found_strings_set.add(key) + print_search(key, f"Not Found") if locations: - console.debug(f"{key} - Found in {len(locations)}") - foundStringsAndLocations[key] = locations - else: - console.debug(f"{key} - Not Found") - notFoundStrings.add(key) + print_search(key, f"Found in {len(locations)} files") + found_strings_and_locations[key] = locations + + if DEBUG: + console.debug(for_loop_iterations) + return found_strings_set, not_found_strings_set, found_strings_and_locations -progress_bar(1, 1, 1, 1) +found_strings_and_locations = None +processing_time_start = time.perf_counter() +if CONCURRENCY_ENABLED: + results_set = process_keys_concurrently() + found_keys = set(key_list).intersection(results_set) + not_found_keys = set(key_list).difference(results_set) +else: + found_keys, not_found_keys, found_strings_and_locations = process_keys() +processing_time_end = time.perf_counter() + +progress_bar(1, 1) flush() # Writing found strings and their locations to a CSV file -makeDirIfNotExists(FOUND_STRINGS_PATH) -with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as csvfile: +if args.write_found_to_file and found_strings_and_locations is not None: + makeDirIfNotExists(FOUND_STRINGS_PATH) + with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as csvfile: csvwriter = csv.writer(csvfile) - csvwriter.writerow(["String", "Phrase", "Locations"]) # Header row - for foundString, locations in foundStringsAndLocations.items(): - # Write each found string and its locations. Locations are joined into a single string for CSV simplicity - csvwriter.writerow( - [foundString, messages_dict[foundString], "; ".join(locations)] - ) + csvwriter.writerow(["String", "Locations"]) # Header row + for foundString, locations in found_strings_and_locations.items(): + # Write each found string and its locations. Locations are joined into a single string for CSV simplicity + csvwriter.writerow( + [foundString, "; ".join(locations)] + ) # Writing not found strings to a text file as before -makeDirIfNotExists(NOT_FOUND_STRINGS_PATH) -with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file: - for notFound in notFoundStrings: - not_found_file.write(f"{notFound}\n") +if args.write_not_found_to_file: + makeDirIfNotExists(NOT_FOUND_STRINGS_PATH) + with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file: + for notFound in not_found_keys: + not_found_file.write(f"{notFound}\n") + +num_found = len(found_keys) +num_not_found = len(not_found_keys) sys.stdout.write("\n") # Print the result statistics and file paths (linkable) -console.info(f"Found {len(foundStringsAndLocations)} strings in {len(files)} files") -console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}") -console.info( - f"Identified {len(notFoundStrings)} not found strings and written to: {NOT_FOUND_STRINGS_PATH}" -) +if args.print_not_found: + [print(key) for key in not_found_keys] -# Search for not found strings in any single quotes across all files -console.info("Searching for potential matches for not found strings...") -current_not_found_number = 0 -current_file_number = 0 -total_not_found_strings = len(notFoundStrings) -potentialMatches = ( - {} -) # Dictionary to store potential matches: {string: [file1, file2, ...]} -for string in notFoundStrings: - console.debug(f"Searching for: {string}") - current_file_number = 0 - quotedStringPattern = re.compile( - r"'{}'".format(string) - ) # Pattern to search for 'STRING' - for file_path in files: - with open(file_path, "r", encoding="utf-8") as file_content: - if quotedStringPattern.search(file_content.read()): - console.debug(f"Potential match found: {string} in {file_path}") - if string not in potentialMatches: - potentialMatches[string] = [] - potentialMatches[string].append(file_path) - current_file_number += 1 - progress_bar( - current_file_number, - total_files, - current_not_found_number, - total_not_found_strings, - ) - current_not_found_number += 1 - - -# Function to find the line numbers of matches within a specific file -def find_line_numbers(file_path, pattern): - line_numbers = [] - with open(file_path, "r", encoding="utf-8") as file: - for i, line in enumerate(file, start=1): - if pattern.search(line): - line_numbers.append(i) - return line_numbers - - -# Process the found files to add line numbers -for string, files in potentialMatches.items(): - for file_path in files: - quotedStringPattern = re.compile(r"'{}'".format(string)) - line_numbers = find_line_numbers(file_path, quotedStringPattern) - match_details = [f"{file_path}:{line}" for line in line_numbers] - potentialMatches[string] = match_details # Update with detailed matches - -# Writing potential matches to CSV, now with line numbers -makeDirIfNotExists(POTENTIAL_MATCHES_PATH) -with open(POTENTIAL_MATCHES_PATH, "w", encoding="utf-8", newline="") as csvfile: - csvwriter = csv.writer(csvfile) - csvwriter.writerow(["String", "Potential File Matches"]) - for string, matches in potentialMatches.items(): - csvwriter.writerow([string, "; ".join(matches)]) -sys.stdout.write("\n") -# Print the result statistics and file paths (linkable) +def find_key_lazy(key): + i = -1 + regex = re.compile(fr"['\"]{re.escape(key)}['\"]") + for file_path in files: + i += 1 + if regex.search(loaded_files[i]): + return key, file_path + return None, None + + +def find_lazy_matches_for_not_found(): + with multiprocessing.Pool() as pool: + result = pool.map(find_key_lazy, not_found_keys) + return set(result) + + +potential_matches = set() +if args.find_potential_matches: + potential_matches = find_lazy_matches_for_not_found() + potential_matches.discard((None, None)) + [console.info(f"{key:<{42}} | Potential Match: {file_name}") for key, file_name in potential_matches] + console.info(f"Found {len(potential_matches)} potential matches") + console.info( - f"Potential matches found for {len(potentialMatches)}/{len(notFoundStrings)} not found strings " -) -console.info(f"Potential matches written to: {POTENTIAL_MATCHES_PATH}") + f"Found {num_found}/{number_of_keys} ({(num_found / number_of_keys):.0%}) strings in {len(files)} files") -# Identify found strings that are not in the master string list -try: - masterStringList = set() - with open(MASTER_STRINGS_PATH, "r", encoding="utf-8") as masterListFile: - for line in masterListFile: - masterStringList.add(line.strip()) +if args.find_potential_matches and len(potential_matches) > 0: + console.info( + f"(Including all potential matches) Found {num_found + len(potential_matches)}/{number_of_keys} ({((num_found + len(potential_matches)) / number_of_keys):.0%}) strings in {len(files)} files") - notInMasterList = missingFromSet( - set(foundStringsAndLocations.keys()), masterStringList - ) +if args.write_found_to_file: + console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}") - try: - slatedForRemovalList = set() - with open(TO_BE_REMOVED_PATH, "r", encoding="utf-8") as slatedForRemovalFile: - for line in slatedForRemovalFile: - slatedForRemovalList.add(line.strip()) - notInMasterList = removeFromSet(notInMasterList, slatedForRemovalList) - except FileNotFoundError: - console.warn( - f"Strings to be removed list not found at: {TO_BE_REMOVED_PATH}. Skipping comparison." - ) - - # Output the found strings not in the master list to a CSV file - makeDirIfNotExists(NOT_IN_MASTER_LIST_PATH) - with open(NOT_IN_MASTER_LIST_PATH, "w", encoding="utf-8", newline="") as csvfile: - csvwriter = csv.writer(csvfile) - csvwriter.writerow(["String", "Phrase", "Locations"]) # Header row - for notInMaster in notInMasterList: - # Write each found string and its locations. Locations are joined into a single string for CSV simplicity - csvwriter.writerow( - [ - notInMaster, - messages_dict[notInMaster], - "; ".join(foundStringsAndLocations[notInMaster]), - ] - ) - console.info(f"Found {len(notInMasterList)} strings not in the master list") - console.info( - f"Found strings not in the master list written to: {NOT_IN_MASTER_LIST_PATH}" - ) -except FileNotFoundError: - console.warn( - f"Master string list not found at: {MASTER_STRINGS_PATH}. Skipping comparison." - ) +if args.write_not_found_to_file: + console.info( + f"Identified {num_not_found} not found strings and written to: {NOT_FOUND_STRINGS_PATH}" + ) +else: + console.info(f"Identified {num_not_found} not found strings") + +if DEBUG and REGEX_TIME_TRACKER > 0: + console.debug(f"Time spend in regex land: {REGEX_TIME_TRACKER:0.4f} seconds") if DEBUG: - console.warn( - "This script ran with debug enabled. Please disable debug mode for a cleaner output and faster execution." - ) + os_walk_time = os_walk_time_end - os_walk_time_start + parse_locale_time = parse_locale_file_time_end - parse_locale_file_time_start + read_files_time = read_files_time_end - read_files_time_start + processing_time = processing_time_end - processing_time_start + console.debug(f"OS Walk reading time: {os_walk_time:0.4f} seconds") + console.debug(f"Locale File parse time: {parse_locale_time:0.4f} seconds") + console.debug(f"File reading time: {read_files_time:0.4f} seconds") + console.debug(f"Processing time: {processing_time:0.4f} seconds") + console.debug( + f"Total Elapsed Tracked Time: {os_walk_time + parse_locale_time + read_files_time + processing_time:0.4f} seconds") timer.stop() + + +def remove_keys_from_json(json_file_path, keys_to_remove): + # Load the JSON data from the file + with open(json_file_path, 'r', encoding='utf-8') as json_file: + data = json.load(json_file) + + # Remove the specified keys from the JSON data + data = {key: value for key, value in data.items() if key not in keys_to_remove} + + # Write the updated data back to the original JSON file + with open(json_file_path, 'w', encoding='utf-8') as json_file: + json.dump(data, json_file, ensure_ascii=False, indent=4) + print(f"Keys removed and JSON file updated: {json_file_path}") + + +if args.delete_unused_keys: + locale_files = find_files_with_extension("./_locales", "message.json") + for locale_file in locale_files: + remove_keys_from_json(locale_file, not_found_keys) diff --git a/tools/localization/regex.py b/tools/localization/regex.py index 77ac55a97..1be444ffe 100644 --- a/tools/localization/regex.py +++ b/tools/localization/regex.py @@ -1,21 +1,33 @@ import re -def localization_regex(string): - e_str = re.escape(string) +# The regex statements are designed to shortcut so are ordered from most common to least common. The advanced cases will also detect the same result as the simple cases. This is fine. +def get_localization_regex_list(string): + key = re.escape(string) + # Regex is ordered from most common to least common + return [ + fr"window\.i18n\('{key}'\)", + fr"window\.i18n\('{key}'(, {{[\S\s.]*}})?\)", + fr"\{{ token: '{key}'(, args: {{.*}})? \}}", + # This also captures the same group as `basic_object` but this is fine because basic_object shortcuts before reaching here if found. + fr"{{\s+token: '{key}',?\s+(\s*args: {{[\S\s.]*}},)?\s+\}}", + fr"window\.i18n\.(stripped|inEnglish|getRawMessage)\('{key}'(, {{[\S\s.]*}})?\)", + fr"]*?token=["\']{}["\'][^>]*?>'.format(e_str) - res_token = r'token=["\']{}["\']'.format(e_str) - res_8n_stripped = r"window\.i18n\.stripped\(\s*'{}'(?:,\s*(?:[^\)]+?))?\s*\)".format(e_str) - res_8n_inEnglish = r"window\.i18n\.inEnglish\(\s*'{}'(?:,\s*(?:[^\)]+?))?\s*\)".format(e_str) - res_8n_raw = r"window\.i18n\.getRawMessage\(\s*'{}'(?:,\s*(?:[^\)]+?))?\s*\)".format(e_str) - res_get_string = r"getString\(\s*'{}'(?:,\s*(?:[^\)]+?))?\s*\)".format(e_str) - res_i18n_args = r"{\s*token: '" + e_str + r"'(?:,\s*(?:[^\)]+?))?\s*}" - return re.compile( - f"{rex_b}|{rex_l}|{res_8n}|{res_comp}|{res_token}|{res_get_string}|{res_8n_stripped}|{res_8n_inEnglish}|{res_8n_raw}|{res_i18n_args}", - re.DOTALL, +def localization_regex_as_list(string): + regex_ordered = get_localization_regex_list(string) + regex_compiled_list = [] + for regex in regex_ordered: + regex_compiled_list.append( + re.compile(regex, re.DOTALL) ) + + return regex_compiled_list