chore: create find unused strings scripts

pull/3206/head
Ryan Miller 7 months ago
parent 017b4f564d
commit 8b16d15762

@@ -5,50 +5,85 @@ import csv
import re
import glob
import argparse
import multiprocessing
import json
from functools import partial

# This allows for importing from the localization and util directories.
# NOTE: Auto-importing tools will also prepend the import paths with "tools.";
# this will not work and needs to be removed from import paths.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from util.time import ExecutionTimer
import time

timer = ExecutionTimer()

from localization.regex import localization_regex_as_list
from util.fileUtils import makeDirIfNotExists, removeFileIfExists
from util.logger import console
parser = argparse.ArgumentParser()
parser.add_argument(
    "--debug", action="store_true", help="Enable debug mode, print debug messages"
)
parser.add_argument(
    "--output-dir",
    type=str,
    default="./tools/localization/analysis",
    help="Output directory for the results",
)
parser.add_argument(
    "--write-found-to-file",
    action="store_true",
    help="Write the found strings to a file",
)
parser.add_argument(
    "--write-not-found-to-file",
    action="store_true",
    help="Write the not found strings to a file",
)
parser.add_argument(
    "--print-not-found",
    action="store_true",
    help="Print the not found strings",
)
parser.add_argument(
    "--identify-found-in-files",
    action="store_true",
    help="Identify the files each string is found in using regex.",
)
parser.add_argument(
    "--identify-line-numbers",
    action="store_true",
    help="Identify line numbers using regex.",
)
parser.add_argument(
    "--disable-concurrency",
    action="store_true",
    help="Disable multiprocessing concurrency.",
)
parser.add_argument(
    "--find-potential-matches",
    action="store_true",
    help="Find potential matched strings using very lazy regex.",
)
parser.add_argument(
    "--delete-unused-keys",
    action="store_true",
    help="Delete unused keys.",
)

args = parser.parse_args()
# Configuration
intentionallyUnusedStrings = []
DEBUG = args.debug
CONCURRENCY_ENABLED = not args.disable_concurrency

if CONCURRENCY_ENABLED and (args.identify_found_in_files or args.identify_line_numbers):
    CONCURRENCY_ENABLED = False
    console.info("Concurrency is disabled when --identify-found-in-files or --identify-line-numbers is used")

if CONCURRENCY_ENABLED:
    console.info("Concurrency enabled. Use --disable-concurrency to disable concurrency.")

console.enableDebug() if DEBUG else None
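
# Editor's sketch (illustrative, not part of this commit): the gating above
# means any line-number identification runs single-process, since only the
# sequential path collects per-line locations; pool workers only report
# whether a key was found at all. The same logic, runnable standalone with
# hypothetical inline flags:
_demo_parser = argparse.ArgumentParser()
_demo_parser.add_argument("--identify-line-numbers", action="store_true")
_demo_parser.add_argument("--disable-concurrency", action="store_true")
_demo_args = _demo_parser.parse_args(["--identify-line-numbers"])
_demo_concurrency = not _demo_args.disable_concurrency and not _demo_args.identify_line_numbers
assert _demo_concurrency is False  # line-number mode always disables the pool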
@@ -60,9 +95,6 @@ NOT_IN_MASTER_LIST_PATH = os.path.join(OUTPUT_DIR, "not_in_master_list.csv")
EN_PATH = "_locales/en/messages.json"

# Remove files that are to be generated if they exist
removeFileIfExists(FOUND_STRINGS_PATH)
removeFileIfExists(NOT_FOUND_STRINGS_PATH)
@@ -71,252 +103,287 @@ removeFileIfExists(NOT_IN_MASTER_LIST_PATH)
def flush():
    sys.stdout.flush() if not DEBUG else None

# File search setup
console.info("Scanning for localized strings...")

files_to_ignore = ["./ts/localization/locales.ts"]
ignore_patterns = [re.compile(re.escape(pattern)) for pattern in files_to_ignore]
console.debug(f"Ignoring files: {', '.join(files_to_ignore)}")

def should_ignore_file(path):
    return any(pattern.search(path) for pattern in ignore_patterns)
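
# Editor's sketch (illustrative): re.escape() above matters because the "."
# in the ignored path would otherwise match any character.
assert should_ignore_file("./ts/localization/locales.ts")
assert not should_ignore_file("./ts/localization/localesXts")  # hypothetical name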
def find_files_with_extension(root_dir, extensions):
    for entry in os.scandir(root_dir):
        if entry.is_dir():
            yield from find_files_with_extension(entry.path, extensions)
        elif any(entry.name.endswith(ext) for ext in extensions) and not should_ignore_file(entry.path):
            yield entry.path

os_walk_time_start = time.perf_counter()
files = set(find_files_with_extension("./ts/", (".ts", ".tsx")))
# Also pick up the preload scripts; this assumes the *preload.js files live at
# the repository root (the original comprehension indexed into a filename,
# a leftover from the os.walk tuples it replaced).
files.update(
    [y for y in glob.glob("./*preload.js") if not should_ignore_file(y)]
)
os_walk_time_end = time.perf_counter()
bar_length = 50
PROGRESS_BAR_CURRENT_PERCENTAGE = 0

def progress_bar(current, total):
    global PROGRESS_BAR_CURRENT_PERCENTAGE
    if DEBUG:
        return
    percent_overall = round(100 * current / total)
    if percent_overall <= PROGRESS_BAR_CURRENT_PERCENTAGE:
        return
    PROGRESS_BAR_CURRENT_PERCENTAGE = percent_overall
    sys.stdout.write("\r")
    sys.stdout.write(
        "Progress: [{:{}}] {:>3}% ".format(
            "=" * int(percent_overall / (100 / bar_length)),
            bar_length,
            int(percent_overall),
        )
    )
    sys.stdout.flush()
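
# Editor's sketch (illustrative): the bar only redraws when the integer
# percentage advances, so a scan over thousands of keys writes at most ~100
# terminal updates. With a hypothetical total of 400 items:
_updates, _last = 0, 0
for _i in range(1, 401):
    _pct = round(100 * _i / 400)
    if _pct > _last:
        _last, _updates = _pct, _updates + 1
assert _updates == 100  # one redraw per percentage point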
# Read the locale file and get all keys
parse_locale_file_time_start = time.perf_counter()
with open(EN_PATH, "r", encoding="utf-8") as messages_file:
    # Materialise the dict_keys view into a list so it can be indexed into
    # and pickled for multiprocessing.
    key_list = list(json.load(messages_file).keys())
number_of_keys = len(key_list)
console.info(f"Loaded {number_of_keys} keys to search for")
parse_locale_file_time_end = time.perf_counter()
def search_string_in_regex_list(regex_list, file_content):
    return any(matcher.search(file_content) for matcher in regex_list)

def load_file(file_path):
    console.debug(f"Loading {file_path} into memory")
    return open(file_path, "r", encoding="utf-8").read()

read_files_time_start = time.perf_counter()
loaded_files = [load_file(file_path) for file_path in files]
read_files_time_end = time.perf_counter()

def find_key(key):
    regex_list = localization_regex_as_list(key)
    return key if any(
        search_string_in_regex_list(regex_list, file_content) for file_content in loaded_files
    ) else None

def process_keys_concurrently():
    with multiprocessing.Pool() as pool:
        result = pool.map(find_key, key_list)
    return set(result)
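
# Editor's note (illustrative sketch): pool.map() works here because
# loaded_files is fully built before the Pool is created, so fork-based
# workers inherit a read-only copy that find_key() can index freely. A
# self-contained equivalent of the pattern (kept commented out, since
# spawning a demo Pool mid-script would be a side effect):
#
#   import multiprocessing
#   _corpus = ["alpha beta", "gamma delta"]      # stands in for loaded_files
#   def _contains(word):                         # stands in for find_key
#       return word if any(word in text for text in _corpus) else None
#   if __name__ == "__main__":
#       with multiprocessing.Pool() as _pool:
#           assert set(_pool.map(_contains, ["beta", "zeta"])) == {"beta", None}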
REGEX_TIME_TRACKER = 0.0

def regex_find(regex_list, file_content):
    global REGEX_TIME_TRACKER  # accumulated across all searches in this process
    regex_start = time.perf_counter()
    found = search_string_in_regex_list(regex_list, file_content)
    regex_end = time.perf_counter()
    REGEX_TIME_TRACKER += regex_end - regex_start
    return found

def print_search(search_key, search_info=""):
    console.debug(f"{search_key:<42} | {search_info}")
def process_keys():
    found_strings_and_locations = {}  # Dictionary to store found strings and their locations
    found_strings_set = set()  # Set to store found strings
    not_found_strings_set = set()  # Set to store not found strings

    for_loop_iterations = {}
    if DEBUG:
        for_loop_iterations["keys"] = 0
        for_loop_iterations["files"] = 0
        for_loop_iterations["lines"] = 0

    for i in range(number_of_keys):
        key = key_list[i]
        regex_list = localization_regex_as_list(key)
        progress_bar(i, number_of_keys)

        if DEBUG:
            for_loop_iterations["keys"] += 1
            print_search(key, "Searching")

        locations = []
        j = -1
        for file_path in files:
            j += 1

            if DEBUG:
                for_loop_iterations["files"] += 1

            if not regex_find(regex_list, loaded_files[j]):
                continue

            found_strings_set.add(key)
            print_search(key, f"Found string in {file_path}")

            if args.identify_line_numbers:
                for line_number, line in enumerate(loaded_files[j].split("\n"), start=1):
                    if DEBUG:
                        for_loop_iterations["lines"] += 1
                    if regex_find(regex_list, line):
                        locations.append(f"./{file_path}:{line_number}")

        if key not in found_strings_set:
            not_found_strings_set.add(key)
            print_search(key, "Not Found")

        if locations:
            print_search(key, f"Found in {len(locations)} locations")
            found_strings_and_locations[key] = locations

    if DEBUG:
        console.debug(for_loop_iterations)

    return found_strings_set, not_found_strings_set, found_strings_and_locations
found_strings_and_locations = None

processing_time_start = time.perf_counter()
if CONCURRENCY_ENABLED:
    results_set = process_keys_concurrently()
    found_keys = set(key_list).intersection(results_set)
    not_found_keys = set(key_list).difference(results_set)
else:
    found_keys, not_found_keys, found_strings_and_locations = process_keys()
processing_time_end = time.perf_counter()

progress_bar(1, 1)
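
# Editor's sketch (illustrative): the pool returns the key for hits and None
# for misses, so plain set algebra recovers both partitions; the stray None
# never collides with a real key.
_all_demo = {"a", "b", "c"}
_results_demo = {"a", None, "c"}  # e.g. "b" was not found
assert _all_demo.intersection(_results_demo) == {"a", "c"}
assert _all_demo.difference(_results_demo) == {"b"}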
flush()

# Writing found strings and their locations to a CSV file
if args.write_found_to_file and found_strings_and_locations is not None:
    makeDirIfNotExists(FOUND_STRINGS_PATH)
    with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["String", "Locations"])  # Header row
        for foundString, locations in found_strings_and_locations.items():
            # Write each found string and its locations, joined into a single
            # string for CSV simplicity
            csvwriter.writerow([foundString, "; ".join(locations)])
# Writing not found strings to a text file as before
if args.write_not_found_to_file:
    makeDirIfNotExists(NOT_FOUND_STRINGS_PATH)
    with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file:
        for notFound in not_found_keys:
            not_found_file.write(f"{notFound}\n")

num_found = len(found_keys)
num_not_found = len(not_found_keys)

sys.stdout.write("\n")

# Print the result statistics and file paths (linkable)
if args.print_not_found:
    for key in not_found_keys:
        print(key)
def find_key_lazy(key):
    i = -1
    regex = re.compile(fr"['\"]{re.escape(key)}['\"]")
    for file_path in files:
        i += 1
        if regex.search(loaded_files[i]):
            return key, file_path
    return None, None

def find_lazy_matches_for_not_found():
    with multiprocessing.Pool() as pool:
        result = pool.map(find_key_lazy, not_found_keys)
    return set(result)

potential_matches = set()
if args.find_potential_matches:
    potential_matches = find_lazy_matches_for_not_found()
    potential_matches.discard((None, None))
    for key, file_name in potential_matches:
        console.info(f"{key:<42} | Potential Match: {file_name}")
    console.info(f"Found {len(potential_matches)} potential matches")
console.info(
    f"Found {num_found}/{number_of_keys} ({(num_found / number_of_keys):.0%}) strings in {len(files)} files")

if args.find_potential_matches and len(potential_matches) > 0:
    console.info(
        f"(Including all potential matches) Found {num_found + len(potential_matches)}/{number_of_keys} "
        f"({((num_found + len(potential_matches)) / number_of_keys):.0%}) strings in {len(files)} files")

if args.write_found_to_file:
    console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}")

if args.write_not_found_to_file:
    console.info(
        f"Identified {num_not_found} not found strings and written to: {NOT_FOUND_STRINGS_PATH}"
    )
else:
    console.info(f"Identified {num_not_found} not found strings")

if DEBUG and REGEX_TIME_TRACKER > 0:
    console.debug(f"Time spent in regex land: {REGEX_TIME_TRACKER:0.4f} seconds")
if DEBUG:
    os_walk_time = os_walk_time_end - os_walk_time_start
    parse_locale_time = parse_locale_file_time_end - parse_locale_file_time_start
    read_files_time = read_files_time_end - read_files_time_start
    processing_time = processing_time_end - processing_time_start
    console.debug(f"OS Walk reading time: {os_walk_time:0.4f} seconds")
    console.debug(f"Locale File parse time: {parse_locale_time:0.4f} seconds")
    console.debug(f"File reading time: {read_files_time:0.4f} seconds")
    console.debug(f"Processing time: {processing_time:0.4f} seconds")
    console.debug(
        f"Total Elapsed Tracked Time: "
        f"{os_walk_time + parse_locale_time + read_files_time + processing_time:0.4f} seconds")

timer.stop()
def remove_keys_from_json(json_file_path, keys_to_remove):
    # Load the JSON data from the file
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    # Remove the specified keys from the JSON data
    data = {key: value for key, value in data.items() if key not in keys_to_remove}
    # Write the updated data back to the original JSON file
    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
    print(f"Keys removed and JSON file updated: {json_file_path}")

if args.delete_unused_keys:
    # find_files_with_extension() expects a tuple of suffixes, and the locale
    # files are named messages.json (see EN_PATH above).
    locale_files = find_files_with_extension("./_locales", ("messages.json",))
    for locale_file in locale_files:
        remove_keys_from_json(locale_file, not_found_keys)
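
# Editor's sketch (illustrative, hypothetical keys): the same key-filtering
# comprehension used above, round-tripped in memory.
_messages_demo = {"okay": {"message": "Okay"}, "oldUnusedKey": {"message": "?"}}
_pruned_demo = {k: v for k, v in _messages_demo.items() if k not in {"oldUnusedKey"}}
assert _pruned_demo == {"okay": {"message": "Okay"}}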

@@ -1,21 +1,33 @@
import re

# The regex statements are designed to shortcut, so they are ordered from most
# common to least common. The advanced cases will also detect the same result
# as the simple cases. This is fine.
def get_localization_regex_list(string):
    key = re.escape(string)
    # Regex is ordered from most common to least common
    return [
        fr"window\.i18n\('{key}'\)",
        fr"window\.i18n\('{key}'(, {{[\S\s.]*}})?\)",
        fr"\{{ token: '{key}'(, args: {{.*}})? \}}",
        # This also captures the same group as the basic object form above, but
        # that is fine because the basic form shortcuts before reaching here if found.
        fr"{{\s+token: '{key}',?\s+(\s*args: {{[\S\s.]*}},)?\s+\}}",
        fr"window\.i18n\.(stripped|inEnglish|getRawMessage)\('{key}'(, {{[\S\s.]*}})?\)",
        fr"<I18n[\S\s.]*token=\{{?['\"]{key}['\"]\}}?",
        fr"<I18n[\S\s.]*token=[\S\s.]*{key}[\S\s.]*",
        fr"i18n\('{key}'\)",
        fr"i18n\('{key}'(, {{[\S\s.]*}})?\)",
        fr"i18n\.(stripped|inEnglish|getRawMessage)\('{key}'(, {{[\S\s.]*}})?\)",
        fr"window\?\.i18n\?\.\('{key}'(, {{[\S\s.]*}})?\)",
        fr"<StyledI18nSubText[\S\s.]*token=[\S\s.]*{key}[\S\s.]*",
    ]


def localization_regex_as_list(string):
    regex_ordered = get_localization_regex_list(string)
    regex_compiled_list = []
    for regex in regex_ordered:
        regex_compiled_list.append(re.compile(regex, re.DOTALL))
    return regex_compiled_list
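
# Editor's sketch (illustrative): the first, cheapest pattern already matches
# the most common call shape, so any() in the caller rarely reaches the more
# expensive alternatives.
_patterns_demo = localization_regex_as_list("continueButton")
assert _patterns_demo[0].search("window.i18n('continueButton')")
assert any(p.search("<I18n token={'continueButton'} />") for p in _patterns_demo)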
