From 4775559cf3bc9017641ea3bf2645d6d9136e9e4d Mon Sep 17 00:00:00 2001 From: Ryan Miller Date: Tue, 17 Sep 2024 16:39:07 +1000 Subject: [PATCH] chore: add validation flagging for strings with incorrect tag formatting --- tools/localization/dynamicVariables.py | 107 ++++++++++++++++++++++++- tools/localization/generateLocales.py | 48 +++++++++-- 2 files changed, 148 insertions(+), 7 deletions(-) diff --git a/tools/localization/dynamicVariables.py b/tools/localization/dynamicVariables.py index bb6e7dec0..6832fae4d 100644 --- a/tools/localization/dynamicVariables.py +++ b/tools/localization/dynamicVariables.py @@ -60,6 +60,100 @@ def extractVariablesFromDict(input_dict): return output_dict_new, output_dict_old +def extractDisallowedTags(input_dict, allowed_tags): + """ + Reads through a dictionary of key-value pairs and creates a new dictionary + where the value is just a list of tags that are not allowed as per the allowed_tags. + + Args: + input_dict (dict): The dictionary to extract tags from. + allowed_tags (list): A list of allowed tag names (e.g., ['b', 'br', 'span']). + + Returns: + dict: A dictionary with the same keys as input_dict, but the values are lists of disallowed tags. + """ + # Compile a regex to match any HTML-like tags + tag_pattern = re.compile(r'<(/?)(\w+)[^>]*>') + + # Create a set of allowed tags for quick lookup + allowed_tag_set = set(allowed_tags) + + output_dict = {} + for key, value in input_dict.items(): + disallowed_tags = [] + for match in tag_pattern.finditer(value): + tag_name = match.group(2) + if tag_name not in allowed_tag_set: + disallowed_tags.append(match.group(0)) + + output_dict[key] = disallowed_tags + + return output_dict + + +def findImproperTags(input_dict): + """ + Reads through a dictionary of key-value pairs and identifies any uses of angled brackets + that do not form a proper HTML tag. + + Args: + input_dict (dict): The dictionary to search for improper tags. + + Returns: + dict: A dictionary with the same keys as input_dict, but the values are lists of improper tags. + """ + # Regular expression to find improper use of angled brackets: + # 1. Matches a standalone '<' or '>' not forming a valid tag. + # 2. Matches text enclosed in angled brackets that do not form a valid HTML tag. + improper_tag_pattern = re.compile(r'<[^>]*>|>') + + output_dict = {} + for key, value in input_dict.items(): + # Find all improper tag matches + improper_tags = [match for match in improper_tag_pattern.findall(value) + if not re.match(r'<\s*/?\s*\w+.*?>', match)] + print(improper_tags) + + # Store the results in the output dictionary + output_dict[key] = improper_tags + + return output_dict + + +def flagInvalidAngleBrackets(input_dict, allowed_tag_starts): + """ + Flags an issue if a string contains an angled bracket '<' + but that angle bracket is not followed by a 'b' or an 's' (case-insensitive). + + Args: + input_dict (dict): A dictionary where the values are strings to check. + + Returns: + dict: A dictionary where keys are the same as input_dict, + and values are lists of issues found in the corresponding string. + """ + output_dict = {} + for key, value in input_dict.items(): + issues = [] + # Find all occurrences of '<' + indices = [m.start() for m in re.finditer('<', value)] + for idx in indices: + # Look ahead to find the next non-space character after '<' + match = re.match(r'\s*([^\s>])', value[idx + 1:]) + if match: + next_char = match.group(1) + if next_char.lower() not in allowed_tag_starts: + # Flag an issue + snippet = value[idx:idx + 10] # Extract a snippet for context + issues.append(f"Invalid tag starting with '<{next_char}' at position {idx}: '{snippet}'") + else: + # No non-space character after '<', flag an issue + issues.append(f"Invalid angle bracket '<' at position {idx}") + if issues: + output_dict[key] = issues + return output_dict + + def extractFormattingTags(input_dict): """ Reads through a dictionary of key-value pairs and creates a new dictionary @@ -74,17 +168,20 @@ def extractFormattingTags(input_dict): output_dict_b_tags = {} output_dict_br_tags = {} output_dict_span_tags = {} + disallowed_tags = extractDisallowedTags(input_dict, ["b", "br", "span"]) + improper_tags = findImproperTags(input_dict) + for key, value in input_dict.items(): console.debug(f"key: {key}, value: {value}") output_dict_b_tags[key] = extractAllMatches(value, r"(.*?)") output_dict_br_tags[key] = extractAllMatches(value, r"
") output_dict_span_tags[key] = extractAllMatches(value, r"(.*?)") - return output_dict_b_tags, output_dict_br_tags, output_dict_span_tags + return output_dict_b_tags, output_dict_br_tags, output_dict_span_tags, disallowed_tags, improper_tags def identifyLocaleDyanmicVariableDifferences(locales, locale_b_tags, locale_br_tags, - locale_span_tags): + locale_span_tags, locale_disallowed_tags, locale_improper_tags): """ Identifies the differences between each locale's dynamic variables. @@ -104,6 +201,8 @@ def identifyLocaleDyanmicVariableDifferences(locales, locale_b_tags, current_locale_b_tags = locale_b_tags[locale_name] current_locale_br_tags = locale_br_tags[locale_name] current_locale_span_tags = locale_span_tags[locale_name] + current_locale_disallowed_tags = locale_disallowed_tags[locale_name] + current_locale_improper_tags = locale_improper_tags[locale_name] if locale_name == "en": continue @@ -115,6 +214,8 @@ def identifyLocaleDyanmicVariableDifferences(locales, locale_b_tags, "missing_b_tags": {}, "missing_br_tags": {}, "missing_span_tags": {}, + "disallowed_tags": {}, + "improper_tags": {}, } for key, value in master_locale.items(): @@ -138,6 +239,8 @@ def identifyLocaleDyanmicVariableDifferences(locales, locale_b_tags, locale_issues["missing_b_tags"][key] = len(master_locale_b_tags[key]) - len(current_locale_b_tags[key]) locale_issues["missing_br_tags"][key] = len(master_locale_br_tags[key]) - len(current_locale_br_tags[key]) locale_issues["missing_span_tags"][key] = len(master_locale_span_tags[key]) - len(current_locale_span_tags[key]) + locale_issues["disallowed_tags"][key] = len(current_locale_disallowed_tags[key]) + locale_issues["improper_tags"][key] = len(current_locale_improper_tags[key]) for key in locale: if key not in master_locale: diff --git a/tools/localization/generateLocales.py b/tools/localization/generateLocales.py index 5d9ec9654..01e46d37f 100755 --- a/tools/localization/generateLocales.py +++ b/tools/localization/generateLocales.py @@ -118,6 +118,8 @@ localeVariablesOld = dict() locale_b_tags = dict() locale_br_tags = dict() locale_span_tags = dict() +locale_disallowed_tags = dict() +locale_improper_tags = dict() # Extract the dynamic variables from each locale and store them in a dictionary for locale, data in locales.items(): console.debug(f"Extracting dynamic variables for {locale}") @@ -129,11 +131,13 @@ for locale, data in locales.items(): locale_b_tags[locale], locale_br_tags[locale], locale_span_tags[locale], + locale_disallowed_tags[locale], + locale_improper_tags[locale], ) = extractFormattingTags(data) problems = identifyLocaleDyanmicVariableDifferences(localeVariables, locale_b_tags, locale_br_tags, - locale_span_tags, ) + locale_span_tags, locale_disallowed_tags, locale_improper_tags) found_old_dynamic_variables = identifyAndPrintOldDynamicVariables( localeVariablesOld, args.print_old_dynamic_variables @@ -182,8 +186,23 @@ if problems: string_to_locales[problem_string] = [locale] else: string_to_locales[problem_string].append(locale) + if "disallowed_tags" in locale_problems: + for problem_string, tag_issues in locale_problems["disallowed_tags"].items(): + print(f"problem string: {problem_string}, tag_issues: {tag_issues}") + if tag_issues > 0: + if problem_string not in string_to_locales: + string_to_locales[problem_string] = [locale] + else: + string_to_locales[problem_string].append(locale) + if "improper_tags" in locale_problems: + for problem_string, tag_issues in locale_problems["improper_tags"].items(): + if tag_issues > 0: + if problem_string not in string_to_locales: + string_to_locales[problem_string] = [locale] + else: + string_to_locales[problem_string].append(locale) - console.info(f"Problem strings: {json.dumps(string_to_locales, indent=2)}") + console.debug(f"Problem strings: {json.dumps(string_to_locales, indent=2)}") message += " See above for problem strings and which locales they are in." if args.print_problem_formatting_tag_strings: @@ -192,6 +211,8 @@ if problems: locale_missing_br_tags = set() locale_missing_b_tags = set() locale_missing_span_tags = set() + locale_disallowed_tags = set() + locale_improper_tags = set() if "missing_br_tags" in locale_problems: for problem_string, tag_issues in locale_problems["missing_br_tags"].items(): if tag_issues > 0: @@ -204,11 +225,21 @@ if problems: for problem_string, tag_issues in locale_problems["missing_span_tags"].items(): if tag_issues > 0: locale_missing_span_tags.add(problem_string) + if "disallowed_tags" in locale_problems: + for problem_string, tag_issues in locale_problems["disallowed_tags"].items(): + if tag_issues > 0: + locale_disallowed_tags.add(problem_string) + if "improper_tags" in locale_problems: + for problem_string, tag_issues in locale_problems["improper_tags"].items(): + if tag_issues > 0: + locale_improper_tags.add(problem_string) locales_to_strings[locale] = { "br": list(locale_missing_br_tags), "b": list(locale_missing_b_tags), "span": list(locale_missing_span_tags), + "disallowed_tags": list(locale_disallowed_tags), + "improper_tags": list(locale_improper_tags), } if locales_to_strings[locale]["br"] == []: @@ -217,11 +248,16 @@ if problems: del locales_to_strings[locale]["b"] if locales_to_strings[locale]["span"] == []: del locales_to_strings[locale]["span"] + if locales_to_strings[locale]["disallowed_tags"] == []: + del locales_to_strings[locale]["disallowed_tags"] + if locales_to_strings[locale]["improper_tags"] == []: + del locales_to_strings[locale]["improper_tags"] console.info(f"Problem strings: {json.dumps(locales_to_strings, indent=2)}") message += " See above for problem strings and which locales they are in." for locale, locale_strings in locales_to_strings.items(): printed_locale = False + printed_problem_strings = set() for tag_type, tag_strings in locale_strings.items(): if tag_strings: if locale in ignored_strings_formatting and tag_strings == ignored_strings_formatting[locale]: @@ -230,9 +266,11 @@ if problems: print(f"{locale} - [Link Here](https://crowdin.com/editor/session-crossplatform-strings/300/en-{locale})") printed_locale = True for tag_string in tag_strings: - number_of_tag_problems += 1 - print( - f"- [{tag_string}](https://crowdin.com/editor/session-crossplatform-strings/300/en-{locale}?view=comfortable&filter=basic&value=3#q={tag_string})") + if tag_string not in printed_problem_strings: + printed_problem_strings.add(tag_string) + number_of_tag_problems += 1 + print( + f"- [{tag_string}](https://crowdin.com/editor/session-crossplatform-strings/300/en-{locale}?view=comfortable&filter=basic&value=3#q={tag_string})") print(f"Total Problems: {number_of_tag_problems}") if args.print_problems: