diff --git a/scripts/brazilian-number-processing.py b/scripts/brazilian-number-processing.py new file mode 100644 index 0000000..2638a99 --- /dev/null +++ b/scripts/brazilian-number-processing.py @@ -0,0 +1,100 @@ +import re +import argparse + +def process_phone_number(raw_phone): + """ + Process the raw phone string from the VCARD and return two formatted numbers: + - The original formatted number, and + - A modified formatted number with the extra (ninth) digit removed, if applicable. + + Desired output: + For a number with a 9-digit subscriber: + Original: "+55 {area} {first 5 of subscriber}-{last 4 of subscriber}" + Modified: "+55 {area} {subscriber[1:5]}-{subscriber[5:]}" + For example, for an input that should represent "027912345678", the outputs are: + "+55 27 91234-5678" and "+55 27 1234-5678" + + This function handles numbers that may already include a "+55" prefix. + It expects that after cleaning, a valid number (without the country code) should have either 10 digits + (2 for area + 8 for subscriber) or 11 digits (2 for area + 9 for subscriber). + If extra digits are present, it takes the last 11 (or 10) digits. + """ + # If the number starts with '+55', remove it for processing. + number_to_process = raw_phone.strip() + if number_to_process.startswith("+55"): + number_to_process = number_to_process[3:].strip() + + # Remove all non-digit characters. + digits = re.sub(r'\D', '', number_to_process) + + # Remove trunk zero if present. + if digits.startswith("0"): + digits = digits[1:] + + # After cleaning, we expect a valid number to have either 10 or 11 digits. + # If there are extra digits, use the last 11 (for a 9-digit subscriber) or last 10 (for an 8-digit subscriber). + if len(digits) > 11: + # Here, we assume the valid number is the last 11 digits. + digits = digits[-11:] + elif len(digits) == 12: + # In some cases with an 8-digit subscriber, take the last 10 digits. + digits = digits[-10:] + + if len(digits) not in (10, 11): + return None, None + + area = digits[:2] + subscriber = digits[2:] + + if len(subscriber) == 9: + # Format the original number (5-4 split, e.g., "91234-5678") + orig_subscriber = f"{subscriber[:5]}-{subscriber[5:]}" + # Create a modified version: drop the first digit of the subscriber to form an 8-digit subscriber (4-4 split) + mod_subscriber = f"{subscriber[1:5]}-{subscriber[5:]}" + original_formatted = f"+55 {area} {orig_subscriber}" + modified_formatted = f"+55 {area} {mod_subscriber}" + elif len(subscriber) == 8: + original_formatted = f"+55 {area} {subscriber[:4]}-{subscriber[4:]}" + modified_formatted = None + + return original_formatted, modified_formatted + +def process_vcard(input_vcard, output_vcard): + with open(input_vcard, 'r', encoding='utf-8') as file: + lines = file.readlines() + + output_lines = [] + + # Regex to capture any telephone line. + # It matches lines starting with "TEL:" or "TEL;TYPE=..." or with prefixes like "item1.TEL:". + phone_pattern = re.compile(r'^(?P(?:TEL(?:;TYPE=[^:]+)?|(?:.*\.)?TEL)):(?P.*)$') + + for line in lines: + stripped_line = line.rstrip("\n") + match = phone_pattern.match(stripped_line) + if match: + raw_phone = match.group("number").strip() + orig_formatted, mod_formatted = process_phone_number(raw_phone) + if orig_formatted: + # Always output using the standardized prefix. + output_lines.append(f"TEL;TYPE=CELL:{orig_formatted}\n") + else: + output_lines.append(line) + if mod_formatted: + output_lines.append(f"TEL;TYPE=CELL:{mod_formatted}\n") + else: + output_lines.append(line) + + with open(output_vcard, 'w', encoding='utf-8') as file: + file.writelines(output_lines) + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Process a VCARD file to standardize telephone entries and add a second TEL line with the modified number (removing the extra ninth digit) for contacts with 9-digit subscribers." + ) + parser.add_argument('input_vcard', type=str, help='Input VCARD file') + parser.add_argument('output_vcard', type=str, help='Output VCARD file') + args = parser.parse_args() + + process_vcard(args.input_vcard, args.output_vcard) + print(f"VCARD processed and saved to {args.output_vcard}") \ No newline at end of file