chore: create find unused strings scripts

pull/3206/head
Ryan Miller 7 months ago
parent 017b4f564d
commit 8b16d15762

@@ -5,50 +5,85 @@ import csv
import re
import glob
import argparse
import multiprocessing
import json
from functools import partial
# This allows importing from the localization and util directories.
# NOTE: Auto-importing tools will also prepend the import paths with "tools." — that prefix does not work here and must be removed from the import paths.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from util.time import ExecutionTimer
import time
timer = ExecutionTimer()
from localization.parseDictionary import parse_dictionary
from localization.regex import localization_regex_as_list
from util.fileUtils import makeDirIfNotExists, removeFileIfExists
from util.logger import console
parser = argparse.ArgumentParser()
parser.add_argument(
    "--debug", action="store_true", help="Enable debug mode, print debug messages"
)
parser.add_argument(
    "--output-dir",
    type=str,
    default="./tools/localization/analysis",
    help="Output directory for the results",
)
parser.add_argument(
    "--write-found-to-file",
    action="store_true",
    help="Write the found strings to a file",
)
parser.add_argument(
    "--write-not-found-to-file",
    action="store_true",
    help="Write the not found strings to a file",
)
parser.add_argument(
    "--print-not-found",
    action="store_true",
    help="Print the not found strings",
)
parser.add_argument(
    "--identify-found-in-files",
    action="store_true",
    help="Identify the files each string is found in.",
)
parser.add_argument(
    "--identify-line-numbers",
    action="store_true",
    help="Identify line numbers using regex.",
)
parser.add_argument(
    "--disable-concurrency",
    action="store_true",
    help="Disable multiprocessing concurrency.",
)
parser.add_argument(
    "--find-potential-matches",
    action="store_true",
    help="Find potentially matching strings using a very lazy regex.",
)
parser.add_argument(
    "--delete-unused-keys",
    action="store_true",
    help="Delete unused keys.",
)
args = parser.parse_args()
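# Example invocations (illustrative only; the script's filename is assumed here,
# the flags are the ones defined above):
#   python tools/localization/findUnusedStrings.py --write-found-to-file --identify-line-numbers
#   python tools/localization/findUnusedStrings.py --find-potential-matches --print-not-found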
# Configuration
intentionallyUnusedStrings = []
DEBUG = args.debug
CONCURRENCY_ENABLED = not args.disable_concurrency
if CONCURRENCY_ENABLED and (args.identify_found_in_files or args.identify_line_numbers):
    CONCURRENCY_ENABLED = False
    console.info("Concurrency is disabled when --identify-found-in-files or --identify-line-numbers is used")
if CONCURRENCY_ENABLED:
    console.info("Concurrency enabled. Use --disable-concurrency to disable it.")
if DEBUG:
    console.enableDebug()
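# Note: per-file and per-line identification only happen on the sequential path in
# process_keys() below, which is why those flags force concurrency off.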
@@ -60,9 +95,6 @@ NOT_IN_MASTER_LIST_PATH = os.path.join(OUTPUT_DIR, "not_in_master_list.csv")
EN_PATH = "_locales/en/messages.json"
# Remove files that are to be generated if they exist
removeFileIfExists(FOUND_STRINGS_PATH)
removeFileIfExists(NOT_FOUND_STRINGS_PATH)
@@ -71,252 +103,287 @@ removeFileIfExists(NOT_IN_MASTER_LIST_PATH)
def flush():
    if not DEBUG:
        sys.stdout.flush()
# File search setup
console.info("Scanning for localized strings...")
files_to_ignore = ["./ts/localization/locales.ts"]
ignore_patterns = [re.compile(re.escape(pattern)) for pattern in files_to_ignore]
console.debug(f"Ignoring files: {', '.join(files_to_ignore)}")
def should_ignore_file(path):
    return any(pattern.search(path) for pattern in ignore_patterns)
def find_files_with_extension(root_dir, extensions):
    for entry in os.scandir(root_dir):
        if entry.is_dir():
            yield from find_files_with_extension(entry.path, extensions)
        elif any(entry.name.endswith(ext) for ext in extensions) and not should_ignore_file(entry.path):
            yield entry.path
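# The generator above walks the tree with os.scandir, recursing into subdirectories
# and filtering ignored files during the walk itself, which is typically cheaper
# than a glob over an os.walk.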
os_walk_time_start = time.perf_counter()
files = set(find_files_with_extension("./ts/", (".ts", ".tsx")))
# Also pick up the *preload.js files in the repository root
files.update(
    path for path in glob.glob("./*preload.js") if not should_ignore_file(path)
)
os_walk_time_end = time.perf_counter()
bar_length = 50
PROGRESS_BAR_CURRENT_PERCENTAGE = 0


def progress_bar(current, total):
    global PROGRESS_BAR_CURRENT_PERCENTAGE
    if DEBUG:
        return
    percent_overall = round(100 * current / total)
    if percent_overall <= PROGRESS_BAR_CURRENT_PERCENTAGE:
        return
    PROGRESS_BAR_CURRENT_PERCENTAGE = percent_overall
    sys.stdout.write("\r")
    sys.stdout.write(
        "Progress: [{:{}}] {:>3}% ".format(
            "=" * int(percent_overall / (100 / bar_length)),
            bar_length,
            int(percent_overall),
        )
    )
    sys.stdout.flush()
# Read the locale file and collect all keys
parse_locale_file_time_start = time.perf_counter()
with open(EN_PATH, "r", encoding="utf-8") as messages_file:
    # list() so the keys can be indexed in process_keys below
    key_list = list(json.load(messages_file).keys())
number_of_keys = len(key_list)
console.info(f"Loaded {number_of_keys} keys to search for")
parse_locale_file_time_end = time.perf_counter()
def search_string_in_regex_list(regex_list, file_content):
    return any(matcher.search(file_content) for matcher in regex_list)
def load_file(file_path):
    console.debug(f"Loading {file_path} into memory")
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()
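# All source files are read into memory once, so the per-key regex searches below
# run against cached strings instead of re-reading each file from disk.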
read_files_time_start = time.perf_counter()
loaded_files = [load_file(file_path) for file_path in files]
read_files_time_end = time.perf_counter()
def find_key(key):
    regex_list = localization_regex_as_list(key)
    return key if any(search_string_in_regex_list(regex_list, file_content) for file_content in loaded_files) else None
def process_keys_concurrently():
    with multiprocessing.Pool() as pool:
        result = pool.map(find_key, key_list)
    return set(result)
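# pool.map returns the key itself for each hit and None for each miss, so the
# caller recovers found/not-found keys via set intersection/difference with key_list.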
# search
for key in keys:
if key in intentionallyUnusedStrings:
continue
searchedLine = localization_regex(key)
REGEX_TIME_TRACKER = 0.0


def regex_find(regex_list, file_content):
    global REGEX_TIME_TRACKER  # Accumulated across all regex_find calls
    regex_start = time.perf_counter()
    found = search_string_in_regex_list(regex_list, file_content)
    regex_end = time.perf_counter()
    REGEX_TIME_TRACKER += regex_end - regex_start
    return found


def print_search(search_key, search_info=""):
    console.debug(f"{search_key:<{42}} | {search_info}")
def process_keys():
    found_strings_and_locations = {}  # Dictionary to store found strings and their locations
    found_strings_set = set()  # Set to store found strings
    not_found_strings_set = set()  # Set to store not found strings
    for_loop_iterations = {}
    if DEBUG:
        for_loop_iterations["keys"] = 0
        for_loop_iterations["files"] = 0
        for_loop_iterations["lines"] = 0
    for i in range(number_of_keys):
        key = key_list[i]
        regex_list = localization_regex_as_list(key)
        progress_bar(i, number_of_keys)
        if DEBUG:
            for_loop_iterations["keys"] += 1
        print_search(key, "Searching")
        locations = []
        # loaded_files is indexed in the same iteration order as files
        j = -1
        for file_path in files:
            j += 1
            if DEBUG:
                for_loop_iterations["files"] += 1
            if not regex_find(regex_list, loaded_files[j]):
                continue
            found_strings_set.add(key)
            print_search(key, f"Found string in {file_path}")
            if args.identify_line_numbers:
                for line_number, line in enumerate(loaded_files[j].split("\n"), start=1):
                    if DEBUG:
                        for_loop_iterations["lines"] += 1
                    if regex_find(regex_list, line):
                        locations.append(f"./{file_path}:{line_number}")
        if key not in found_strings_set:
            not_found_strings_set.add(key)
            print_search(key, "Not Found")
        if locations:
            print_search(key, f"Found in {len(locations)} files")
            found_strings_and_locations[key] = locations
    if DEBUG:
        console.debug(for_loop_iterations)
    return found_strings_set, not_found_strings_set, found_strings_and_locations
found_strings_and_locations = None
processing_time_start = time.perf_counter()
if CONCURRENCY_ENABLED:
    results_set = process_keys_concurrently()
    found_keys = set(key_list).intersection(results_set)
    not_found_keys = set(key_list).difference(results_set)
else:
    found_keys, not_found_keys, found_strings_and_locations = process_keys()
processing_time_end = time.perf_counter()
progress_bar(1, 1)
flush()
# Writing found strings and their locations to a CSV file
if args.write_found_to_file and found_strings_and_locations is not None:
    makeDirIfNotExists(FOUND_STRINGS_PATH)
    with open(FOUND_STRINGS_PATH, "w", encoding="utf-8", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(["String", "Locations"])  # Header row
        for foundString, locations in found_strings_and_locations.items():
            # Write each found string and its locations. Locations are joined into a single string for CSV simplicity
            csvwriter.writerow([foundString, "; ".join(locations)])
# Writing not found strings to a text file
if args.write_not_found_to_file:
    makeDirIfNotExists(NOT_FOUND_STRINGS_PATH)
    with open(NOT_FOUND_STRINGS_PATH, "w", encoding="utf-8") as not_found_file:
        for notFound in not_found_keys:
            not_found_file.write(f"{notFound}\n")
num_found = len(found_keys)
num_not_found = len(not_found_keys)
sys.stdout.write("\n")
# Print the result statistics and file paths (linkable)
console.info(f"Found {len(foundStringsAndLocations)} strings in {len(files)} files")
console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}")
console.info(
f"Identified {len(notFoundStrings)} not found strings and written to: {NOT_FOUND_STRINGS_PATH}"
)
if args.print_not_found:
[print(key) for key in not_found_keys]
def find_key_lazy(key):
    i = -1
    # Very lazy match: the key wrapped in single or double quotes, anywhere in a file
    regex = re.compile(fr"['\"]{re.escape(key)}['\"]")
    for file_path in files:
        i += 1
        if regex.search(loaded_files[i]):
            return key, file_path
    return None, None


def find_lazy_matches_for_not_found():
    with multiprocessing.Pool() as pool:
        result = pool.map(find_key_lazy, not_found_keys)
    return set(result)


potential_matches = set()
if args.find_potential_matches:
    potential_matches = find_lazy_matches_for_not_found()
    potential_matches.discard((None, None))
    for key, file_name in potential_matches:
        console.info(f"{key:<{42}} | Potential Match: {file_name}")
    console.info(f"Found {len(potential_matches)} potential matches")
console.info(
    f"Found {num_found}/{number_of_keys} ({(num_found / number_of_keys):.0%}) strings in {len(files)} files")
if args.find_potential_matches and len(potential_matches) > 0:
    console.info(
        f"(Including all potential matches) Found {num_found + len(potential_matches)}/{number_of_keys} ({((num_found + len(potential_matches)) / number_of_keys):.0%}) strings in {len(files)} files")
if args.write_found_to_file:
    console.info(f"Found strings and their locations written to: {FOUND_STRINGS_PATH}")
if args.write_not_found_to_file:
    console.info(
        f"Identified {num_not_found} not found strings and written to: {NOT_FOUND_STRINGS_PATH}"
    )
else:
    console.info(f"Identified {num_not_found} not found strings")
if DEBUG and REGEX_TIME_TRACKER > 0:
    console.debug(f"Time spent in regex land: {REGEX_TIME_TRACKER:0.4f} seconds")
if DEBUG:
    console.warn(
        "This script ran with debug enabled. Please disable debug mode for a cleaner output and faster execution."
    )
os_walk_time = os_walk_time_end - os_walk_time_start
parse_locale_time = parse_locale_file_time_end - parse_locale_file_time_start
read_files_time = read_files_time_end - read_files_time_start
processing_time = processing_time_end - processing_time_start
console.debug(f"OS Walk reading time: {os_walk_time:0.4f} seconds")
console.debug(f"Locale File parse time: {parse_locale_time:0.4f} seconds")
console.debug(f"File reading time: {read_files_time:0.4f} seconds")
console.debug(f"Processing time: {processing_time:0.4f} seconds")
console.debug(
    f"Total Elapsed Tracked Time: {os_walk_time + parse_locale_time + read_files_time + processing_time:0.4f} seconds")
timer.stop()
def remove_keys_from_json(json_file_path, keys_to_remove):
    # Load the JSON data from the file
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    # Remove the specified keys from the JSON data
    data = {key: value for key, value in data.items() if key not in keys_to_remove}
    # Write the updated data back to the original JSON file
    with open(json_file_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
    print(f"Keys removed and JSON file updated: {json_file_path}")
if args.delete_unused_keys:
    # extensions must be an iterable of suffixes; the locale files are named messages.json
    locale_files = find_files_with_extension("./_locales", ("messages.json",))
    for locale_file in locale_files:
        remove_keys_from_json(locale_file, not_found_keys)

@@ -1,21 +1,33 @@
import re


# The regex statements are designed to short-circuit, so they are ordered from most
# common to least common. The advanced cases also detect the same result as the
# simple cases. This is fine.
def get_localization_regex_list(string):
    key = re.escape(string)
    # Regex is ordered from most common to least common
    return [
        fr"window\.i18n\('{key}'\)",
        fr"window\.i18n\('{key}'(, {{[\S\s.]*}})?\)",
        fr"\{{ token: '{key}'(, args: {{.*}})? \}}",
        # This also captures the same group as `basic_object`, but this is fine
        # because basic_object short-circuits before reaching here if found.
        fr"{{\s+token: '{key}',?\s+(\s*args: {{[\S\s.]*}},)?\s+\}}",
        fr"window\.i18n\.(stripped|inEnglish|getRawMessage)\('{key}'(, {{[\S\s.]*}})?\)",
        fr"<I18n[\S\s.]*token=\{{?['\"]{key}['\"]\}}?",
        fr"<I18n[\S\s.]*token=[\S\s.]*{key}[\S\s.]*",
        fr"i18n\('{key}'\)",
        fr"i18n\('{key}'(, {{[\S\s.]*}})?\)",
        fr"i18n\.(stripped|inEnglish|getRawMessage)\('{key}'(, {{[\S\s.]*}})?\)",
        fr"window\?\.i18n\?\.\('{key}'(, {{[\S\s.]*}})?\)",
        fr"<StyledI18nSubText[\S\s.]*token=[\S\s.]*{key}[\S\s.]*",
    ]


def localization_regex_as_list(string):
    regex_ordered = get_localization_regex_list(string)
    regex_compiled_list = []
    for regex in regex_ordered:
        regex_compiled_list.append(re.compile(regex, re.DOTALL))
    return regex_compiled_list
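# Illustrative example (hypothetical key): localization_regex_as_list("continue")
# yields compiled patterns that match usages such as:
#   window.i18n('continue')
#   window.i18n('continue', { count: 1 })
#   { token: 'continue', args: { count: 1 } }
#   <I18n token="continue" />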
