diff --git a/README.md b/README.md index a1e0830..b0e1e22 100644 --- a/README.md +++ b/README.md @@ -145,55 +145,73 @@ After extracting, you will get these: Invoke the wtsexporter with --help option will show you all options available. ```sh > wtsexporter --help -usage: wtsexporter [-h] [-a] [-i] [-e EXPORTED] [-w WA] [-m MEDIA] [-b BACKUP] [-o OUTPUT] [-j [JSON]] - [--avoid-encoding-json] [--pretty-print-json [PRETTY_PRINT_JSON]] [-d DB] [-k KEY] [-t TEMPLATE] - [-s] [-c] [--offline OFFLINE] [--size [SIZE]] [--no-html] [--check-update] [--assume-first-as-me] - [--no-avatar] [--import] [--business] [--wab WAB] [--time-offset {-12 to 14}] [--date DATE] +usage: wtsexporter [-h] [-a] [-i] [-e EXPORTED] [-w WA] [-m MEDIA] [-b BACKUP] [-d DB] [-k [KEY]] + [--call-db [CALL_DB_IOS]] [--wab WAB] [-o OUTPUT] [-j [JSON]] [--txt [TEXT_FORMAT]] [--no-html] + [--size [SIZE]] [--avoid-encoding-json] [--pretty-print-json [PRETTY_PRINT_JSON]] [--per-chat] + [--import] [-t TEMPLATE] [--offline OFFLINE] [--no-avatar] [--experimental-new-theme] + [--headline HEADLINE] [-c] [--create-separated-media] [--time-offset {-12 to 14}] [--date DATE] [--date-format FORMAT] [--include [phone number ...]] [--exclude [phone number ...]] - [--dont-filter-empty] [--per-chat] [--create-separated-media] - [--decrypt-chunk-size DECRYPT_CHUNK_SIZE] [--enrich-from-vcards ENRICH_FROM_VCARDS] - [--default-country-code DEFAULT_CONTRY_CODE] [--txt [TEXT_FORMAT]] [--experimental-new-theme] - [--call-db [CALL_DB_IOS]] [--headline HEADLINE] + [--dont-filter-empty] [--enrich-from-vcards ENRICH_FROM_VCARDS] + [--default-country-code DEFAULT_COUNTRY_CODE] [-s] [--check-update] [--assume-first-as-me] + [--business] [--decrypt-chunk-size DECRYPT_CHUNK_SIZE] + [--max-bruteforce-worker MAX_BRUTEFORCE_WORKER] A customizable Android and iOS/iPadOS WhatsApp database parser that will give you the history of your WhatsApp conversations in HTML and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported. options: -h, --help show this help message and exit + +Device Type: -a, --android Define the target as Android - -i, --ios, Define the target as iPhone/iPad - -e EXPORTED, --exported EXPORTED + -i, --ios Define the target as iPhone/iPad + -e, --exported EXPORTED Define the target as exported chat file and specify the path to the file - -w WA, --wa WA Path to contact database (default: wa.db/ContactsV2.sqlite) - -m MEDIA, --media MEDIA - Path to WhatsApp media folder (default: WhatsApp) - -b BACKUP, --backup BACKUP - Path to Android (must be used together with -k)/iOS WhatsApp backup - -o OUTPUT, --output OUTPUT - Output to specific directory (default: result) - -j [JSON], --json [JSON] - Save the result to a single JSON file (default if present: result.json) + +Input Files: + -w, --wa WA Path to contact database (default: wa.db/ContactsV2.sqlite) + -m, --media MEDIA Path to WhatsApp media folder (default: WhatsApp) + -b, --backup BACKUP Path to Android (must be used together with -k)/iOS WhatsApp backup + -d, --db DB Path to database file (default: msgstore.db/7c7fba66680ef796b916b067077cc246adacf01d) + -k, --key [KEY] Path to key file. If this option is set for crypt15 backup but nothing is specified, you will + be prompted to enter the key. + --call-db [CALL_DB_IOS] + Path to call database (default: 1b432994e958845fffe8e2f190f26d1511534088) iOS only + --wab, --wa-backup WAB + Path to contact database in crypt15 format + +Output Options: + -o, --output OUTPUT Output to specific directory (default: result) + -j, --json [JSON] Save the result to a single JSON file (default if present: result.json) + --txt [TEXT_FORMAT] Export chats in text format similar to what WhatsApp officially provided (default if present: + result/) + --no-html Do not output html files + --size, --output-size, --split [SIZE] + Maximum (rough) size of a single output file in bytes, 0 for auto + +JSON Options: --avoid-encoding-json Don't encode non-ascii characters in the output JSON files --pretty-print-json [PRETTY_PRINT_JSON] Pretty print the output JSON. - -d DB, --db DB Path to database file (default: msgstore.db/7c7fba66680ef796b916b067077cc246adacf01d) - -k KEY, --key KEY Path to key file - -t TEMPLATE, --template TEMPLATE - Path to custom HTML template - -s, --showkey Show the HEX key used to decrypt the database - -c, --move-media Move the media directory to output directory if the flag is set, otherwise copy it - --offline OFFLINE Relative path to offline static files - --size [SIZE], --output-size [SIZE], --split [SIZE] - Maximum (rough) size of a single output file in bytes, 0 for auto - --no-html Do not output html files - --check-update Check for updates (require Internet access) - --assume-first-as-me Assume the first message in a chat as sent by me (must be used together with -e) - --no-avatar Do not render avatar in HTML output + --per-chat Output the JSON file per chat --import Import JSON file and convert to HTML output - --business Use Whatsapp Business default files (iOS only) - --wab WAB, --wa-backup WAB - Path to contact database in crypt15 format + +HTML Options: + -t, --template TEMPLATE + Path to custom HTML template + --offline OFFLINE Relative path to offline static files + --no-avatar Do not render avatar in HTML output + --experimental-new-theme + Use the newly designed WhatsApp-alike theme + --headline HEADLINE The custom headline for the HTML output. Use '??' as a placeholder for the chat name + +Media Handling: + -c, --move-media Move the media directory to output directory if the flag is set, otherwise copy it + --create-separated-media + Create a copy of the media seperated per chat in /separated/ directory + +Filtering Options: --time-offset {-12 to 14} Offset in hours (-12 to 14) for time displayed in the output --date DATE The date filter in specific format (inclusive) @@ -204,26 +222,26 @@ options: Exclude chats that match the supplied phone number --dont-filter-empty By default, the exporter will not render chats with no valid message. Setting this flag will cause the exporter to render those. This is useful if chat(s) are missing from the output - --per-chat Output the JSON file per chat - --create-separated-media - Create a copy of the media seperated per chat in /separated/ directory - --decrypt-chunk-size DECRYPT_CHUNK_SIZE - Specify the chunk size for decrypting iOS backup, which may affect the decryption speed. + +Contact Enrichment: --enrich-from-vcards ENRICH_FROM_VCARDS Path to an exported vcf file from Google contacts export. Add names missing from WhatsApp's default database - --default-country-code DEFAULT_CONTRY_CODE + --default-country-code DEFAULT_COUNTRY_CODE Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. Most likely use the number of your own country - --txt [TEXT_FORMAT] Export chats in text format similar to what WhatsApp officially provided (default if present: - result/) - --experimental-new-theme - Use the newly designed WhatsApp-alike theme - --call-db [CALL_DB_IOS] - Path to call database (default: 1b432994e958845fffe8e2f190f26d1511534088) iOS only - --headline HEADLINE The custom headline for the HTML output. Use '??' as a placeholder for the chat name -WhatsApp Chat Exporter: 0.11.2 Licensed with MIT. See https://wts.knugi.dev/docs?dest=osl for all open source +Miscellaneous: + -s, --showkey Show the HEX key used to decrypt the database + --check-update Check for updates (require Internet access) + --assume-first-as-me Assume the first message in a chat as sent by me (must be used together with -e) + --business Use Whatsapp Business default files (iOS only) + --decrypt-chunk-size DECRYPT_CHUNK_SIZE + Specify the chunk size for decrypting iOS backup, which may affect the decryption speed. + --max-bruteforce-worker MAX_BRUTEFORCE_WORKER + Specify the maximum number of worker for bruteforce decryption. + +WhatsApp Chat Exporter: 0.12.0 Licensed with MIT. See https://wts.knugi.dev/docs?dest=osl for all open source licenses. ``` diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index d8d9b2a..bf130be 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -7,6 +7,20 @@ import shutil import json import string import glob +import importlib.metadata +from Whatsapp_Chat_Exporter import android_crypt, exported_handler, android_handler +from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler +from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, check_update, DbType +from Whatsapp_Chat_Exporter.utility import readable_to_bytes, sanitize_filename +from Whatsapp_Chat_Exporter.utility import import_from_json, bytes_to_readable +from argparse import ArgumentParser, SUPPRESS +from datetime import datetime +from getpass import getpass +from sys import exit +from typing import Tuple, Optional, List, Dict, Any, Union + +# Try to import vobject for contacts processing try: import vobject except ModuleNotFoundError: @@ -14,321 +28,218 @@ except ModuleNotFoundError: else: from Whatsapp_Chat_Exporter.vcards_contacts import ContactsFromVCards vcards_deps_installed = True -from Whatsapp_Chat_Exporter import exported_handler, android_handler -from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler -from Whatsapp_Chat_Exporter.data_model import ChatStore -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, DbType, readable_to_bytes, check_update -from Whatsapp_Chat_Exporter.utility import import_from_json, sanitize_filename, bytes_to_readable -from argparse import ArgumentParser, SUPPRESS -from datetime import datetime -from sys import exit -import importlib.metadata -def main(): +def setup_argument_parser() -> ArgumentParser: + """Set up and return the argument parser with all options.""" parser = ArgumentParser( - description = 'A customizable Android and iOS/iPadOS WhatsApp database parser that ' - 'will give you the history of your WhatsApp conversations in HTML ' - 'and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported.', - epilog = f'WhatsApp Chat Exporter: {importlib.metadata.version("whatsapp_chat_exporter")} Licensed with MIT. See ' - 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' + description='A customizable Android and iOS/iPadOS WhatsApp database parser that ' + 'will give you the history of your WhatsApp conversations in HTML ' + 'and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported.', + epilog=f'WhatsApp Chat Exporter: {importlib.metadata.version("whatsapp_chat_exporter")} Licensed with MIT. See ' + 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' ) - parser.add_argument( - '-a', - '--android', - dest='android', - default=False, - action='store_true', - help="Define the target as Android") - parser.add_argument( - '-i', - '--ios', - dest='ios', - default=False, - action='store_true', - help="Define the target as iPhone/iPad") - parser.add_argument( - "-e", - "--exported", - dest="exported", - default=None, + + # Device type arguments + device_group = parser.add_argument_group('Device Type') + device_group.add_argument( + '-a', '--android', dest='android', default=False, action='store_true', + help="Define the target as Android" + ) + device_group.add_argument( + '-i', '--ios', dest='ios', default=False, action='store_true', + help="Define the target as iPhone/iPad" + ) + device_group.add_argument( + "-e", "--exported", dest="exported", default=None, help="Define the target as exported chat file and specify the path to the file" ) - parser.add_argument( - "-w", - "--wa", - dest="wa", - default=None, - help="Path to contact database (default: wa.db/ContactsV2.sqlite)") - parser.add_argument( - "-m", - "--media", - dest="media", - default=None, - help="Path to WhatsApp media folder (default: WhatsApp)") - parser.add_argument( - "-b", - "--backup", - dest="backup", - default=None, - help="Path to Android (must be used together " - "with -k)/iOS WhatsApp backup") - parser.add_argument( - "-o", - "--output", - dest="output", - default="result", - help="Output to specific directory (default: result)") - parser.add_argument( - '-j', - '--json', - dest='json', - nargs='?', - default=None, - type=str, - const="result.json", - help="Save the result to a single JSON file (default if present: result.json)") - parser.add_argument( - '--avoid-encoding-json', - dest='avoid_encoding_json', - default=False, - action='store_true', - help="Don't encode non-ascii characters in the output JSON files") - parser.add_argument( - '--pretty-print-json', - dest='pretty_print_json', - default=None, - nargs='?', - const=2, - type=int, - help="Pretty print the output JSON.") - parser.add_argument( - '-d', - '--db', - dest='db', - default=None, - help="Path to database file (default: msgstore.db/" - "7c7fba66680ef796b916b067077cc246adacf01d)") - parser.add_argument( - '-k', - '--key', - dest='key', - default=None, - help="Path to key file" + + # Input file paths + input_group = parser.add_argument_group('Input Files') + input_group.add_argument( + "-w", "--wa", dest="wa", default=None, + help="Path to contact database (default: wa.db/ContactsV2.sqlite)" ) - parser.add_argument( - "-t", - "--template", - dest="template", - default=None, - help="Path to custom HTML template" + input_group.add_argument( + "-m", "--media", dest="media", default=None, + help="Path to WhatsApp media folder (default: WhatsApp)" ) - parser.add_argument( - "--embedded", - dest="embedded", - default=False, - action='store_true', - help=SUPPRESS or "Embed media into HTML file (not yet implemented)" + input_group.add_argument( + "-b", "--backup", dest="backup", default=None, + help="Path to Android (must be used together with -k)/iOS WhatsApp backup" ) - parser.add_argument( - "-s", - "--showkey", - dest="showkey", - default=False, - action='store_true', - help="Show the HEX key used to decrypt the database" + input_group.add_argument( + "-d", "--db", dest="db", default=None, + help="Path to database file (default: msgstore.db/7c7fba66680ef796b916b067077cc246adacf01d)" ) - parser.add_argument( - "-c", - "--move-media", - dest="move_media", - default=False, - action='store_true', - help="Move the media directory to output directory if the flag is set, otherwise copy it" + input_group.add_argument( + "-k", "--key", dest="key", default=None, nargs='?', + help="Path to key file. If this option is set for crypt15 backup but nothing is specified, you will be prompted to enter the key." ) - parser.add_argument( - "--offline", - dest="offline", - default=None, - help="Relative path to offline static files" + input_group.add_argument( + "--call-db", dest="call_db_ios", nargs='?', default=None, type=str, + const="1b432994e958845fffe8e2f190f26d1511534088", + help="Path to call database (default: 1b432994e958845fffe8e2f190f26d1511534088) iOS only" ) - parser.add_argument( - "--size", - "--output-size", - "--split", - dest="size", - nargs='?', - const=0, - default=None, - help="Maximum (rough) size of a single output file in bytes, 0 for auto" - ) - parser.add_argument( - "--no-html", - dest="no_html", - default=False, - action='store_true', - help="Do not output html files" - ) - parser.add_argument( - "--check-update", - dest="check_update", - default=False, - action='store_true', - help="Check for updates (require Internet access)" - ) - parser.add_argument( - "--assume-first-as-me", - dest="assume_first_as_me", - default=False, - action='store_true', - help="Assume the first message in a chat as sent by me (must be used together with -e)" - ) - parser.add_argument( - "--no-avatar", - dest="no_avatar", - default=False, - action='store_true', - help="Do not render avatar in HTML output" - ) - parser.add_argument( - "--import", - dest="import_json", - default=False, - action='store_true', - help="Import JSON file and convert to HTML output" - ) - parser.add_argument( - "--business", - dest="business", - default=False, - action='store_true', - help="Use Whatsapp Business default files (iOS only)" - ) - parser.add_argument( - "--wab", - "--wa-backup", - dest="wab", - default=None, + input_group.add_argument( + "--wab", "--wa-backup", dest="wab", default=None, help="Path to contact database in crypt15 format" ) - parser.add_argument( - "--time-offset", - dest="timezone_offset", - default=0, - type=int, - choices=range(-12, 15), - metavar="{-12 to 14}", - help="Offset in hours (-12 to 14) for time displayed in the output" + + # Output options + output_group = parser.add_argument_group('Output Options') + output_group.add_argument( + "-o", "--output", dest="output", default="result", + help="Output to specific directory (default: result)" ) - parser.add_argument( - "--date", - dest="filter_date", - default=None, - metavar="DATE", + output_group.add_argument( + '-j', '--json', dest='json', nargs='?', default=None, type=str, const="result.json", + help="Save the result to a single JSON file (default if present: result.json)" + ) + output_group.add_argument( + "--txt", dest="text_format", nargs='?', default=None, type=str, const="result", + help="Export chats in text format similar to what WhatsApp officially provided (default if present: result/)" + ) + output_group.add_argument( + "--no-html", dest="no_html", default=False, action='store_true', + help="Do not output html files" + ) + output_group.add_argument( + "--size", "--output-size", "--split", dest="size", nargs='?', const=0, default=None, + help="Maximum (rough) size of a single output file in bytes, 0 for auto" + ) + + # JSON formatting options + json_group = parser.add_argument_group('JSON Options') + json_group.add_argument( + '--avoid-encoding-json', dest='avoid_encoding_json', default=False, action='store_true', + help="Don't encode non-ascii characters in the output JSON files" + ) + json_group.add_argument( + '--pretty-print-json', dest='pretty_print_json', default=None, nargs='?', const=2, type=int, + help="Pretty print the output JSON." + ) + json_group.add_argument( + "--per-chat", dest="json_per_chat", default=False, action='store_true', + help="Output the JSON file per chat" + ) + json_group.add_argument( + "--import", dest="import_json", default=False, action='store_true', + help="Import JSON file and convert to HTML output" + ) + + # HTML options + html_group = parser.add_argument_group('HTML Options') + html_group.add_argument( + "-t", "--template", dest="template", default=None, + help="Path to custom HTML template" + ) + html_group.add_argument( + "--embedded", dest="embedded", default=False, action='store_true', + help=SUPPRESS or "Embed media into HTML file (not yet implemented)" + ) + html_group.add_argument( + "--offline", dest="offline", default=None, + help="Relative path to offline static files" + ) + html_group.add_argument( + "--no-avatar", dest="no_avatar", default=False, action='store_true', + help="Do not render avatar in HTML output" + ) + html_group.add_argument( + "--experimental-new-theme", dest="whatsapp_theme", default=False, action='store_true', + help="Use the newly designed WhatsApp-alike theme" + ) + html_group.add_argument( + "--headline", dest="headline", default="Chat history with ??", + help="The custom headline for the HTML output. Use '??' as a placeholder for the chat name" + ) + + # Media handling + media_group = parser.add_argument_group('Media Handling') + media_group.add_argument( + "-c", "--move-media", dest="move_media", default=False, action='store_true', + help="Move the media directory to output directory if the flag is set, otherwise copy it" + ) + media_group.add_argument( + "--create-separated-media", dest="separate_media", default=False, action='store_true', + help="Create a copy of the media seperated per chat in /separated/ directory" + ) + + # Filtering options + filter_group = parser.add_argument_group('Filtering Options') + filter_group.add_argument( + "--time-offset", dest="timezone_offset", default=0, type=int, choices=range(-12, 15), + metavar="{-12 to 14}", help="Offset in hours (-12 to 14) for time displayed in the output" + ) + filter_group.add_argument( + "--date", dest="filter_date", default=None, metavar="DATE", help="The date filter in specific format (inclusive)" ) - parser.add_argument( - "--date-format", - dest="filter_date_format", - default="%Y-%m-%d %H:%M", - metavar="FORMAT", + filter_group.add_argument( + "--date-format", dest="filter_date_format", default="%Y-%m-%d %H:%M", metavar="FORMAT", help="The date format for the date filter" ) - parser.add_argument( - "--include", - dest="filter_chat_include", - nargs='*', - metavar="phone number", + filter_group.add_argument( + "--include", dest="filter_chat_include", nargs='*', metavar="phone number", help="Include chats that match the supplied phone number" ) - parser.add_argument( - "--exclude", - dest="filter_chat_exclude", - nargs='*', - metavar="phone number", + filter_group.add_argument( + "--exclude", dest="filter_chat_exclude", nargs='*', metavar="phone number", help="Exclude chats that match the supplied phone number" ) - parser.add_argument( - "--dont-filter-empty", - dest="filter_empty", - default=True, - action='store_false', + filter_group.add_argument( + "--dont-filter-empty", dest="filter_empty", default=True, action='store_false', help=("By default, the exporter will not render chats with no valid message. " "Setting this flag will cause the exporter to render those. " "This is useful if chat(s) are missing from the output") ) - parser.add_argument( - "--per-chat", - dest="json_per_chat", - default=False, - action='store_true', - help="Output the JSON file per chat" - ) - parser.add_argument( - "--create-separated-media", - dest="separate_media", - default=False, - action='store_true', - help="Create a copy of the media seperated per chat in /separated/ directory" - ) - parser.add_argument( - "--decrypt-chunk-size", - dest="decrypt_chunk_size", - default=1 * 1024 * 1024, - type=int, - help="Specify the chunk size for decrypting iOS backup, which may affect the decryption speed." - ) - parser.add_argument( - "--enrich-from-vcards", - dest="enrich_from_vcards", - default=None, + + # Contact enrichment + contact_group = parser.add_argument_group('Contact Enrichment') + contact_group.add_argument( + "--enrich-from-vcards", dest="enrich_from_vcards", default=None, help="Path to an exported vcf file from Google contacts export. Add names missing from WhatsApp's default database" ) - parser.add_argument( - "--default-country-code", - dest="default_contry_code", - default=None, + contact_group.add_argument( + "--default-country-code", dest="default_country_code", default=None, help="Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. Most likely use the number of your own country" ) - parser.add_argument( - "--txt", - dest="text_format", - nargs='?', - default=None, - type=str, - const="result", - help="Export chats in text format similar to what WhatsApp officially provided (default if present: result/)" + + # Miscellaneous + misc_group = parser.add_argument_group('Miscellaneous') + misc_group.add_argument( + "-s", "--showkey", dest="showkey", default=False, action='store_true', + help="Show the HEX key used to decrypt the database" ) - parser.add_argument( - "--experimental-new-theme", - dest="whatsapp_theme", - default=False, - action='store_true', - help="Use the newly designed WhatsApp-alike theme" + misc_group.add_argument( + "--check-update", dest="check_update", default=False, action='store_true', + help="Check for updates (require Internet access)" ) - parser.add_argument( - "--call-db", - dest="call_db_ios", - nargs='?', - default=None, - type=str, - const="1b432994e958845fffe8e2f190f26d1511534088", - help="Path to call database (default: 1b432994e958845fffe8e2f190f26d1511534088) iOS only" + misc_group.add_argument( + "--assume-first-as-me", dest="assume_first_as_me", default=False, action='store_true', + help="Assume the first message in a chat as sent by me (must be used together with -e)" ) - parser.add_argument( - "--headline", - dest="headline", - default="Chat history with ??", - help="The custom headline for the HTML output. Use '??' as a placeholder for the chat name" + misc_group.add_argument( + "--business", dest="business", default=False, action='store_true', + help="Use Whatsapp Business default files (iOS only)" ) + misc_group.add_argument( + "--decrypt-chunk-size", dest="decrypt_chunk_size", default=1 * 1024 * 1024, type=int, + help="Specify the chunk size for decrypting iOS backup, which may affect the decryption speed." + ) + misc_group.add_argument( + "--max-bruteforce-worker", dest="max_bruteforce_worker", default=10, type=int, + help="Specify the maximum number of worker for bruteforce decryption." + ) + + return parser - args = parser.parse_args() - # Check for updates - if args.check_update: - exit(check_update()) - - # Sanity checks +def validate_args(parser: ArgumentParser, args) -> None: + """Validate command line arguments and modify them if needed.""" + # Basic validation checks if args.android and args.ios and args.exported and args.import_json: parser.error("You must define only one device type.") if not args.android and not args.ios and not args.exported and not args.import_json: @@ -343,235 +254,412 @@ def main(): parser.error("WhatsApp Business is only available on iOS for now.") if "??" not in args.headline: parser.error("--headline must contain '??' for replacement.") - if args.json_per_chat and ( - (args.json[-5:] != ".json" and os.path.isfile(args.json)) or \ - (args.json[-5:] == ".json" and os.path.isfile(args.json[:-5])) + + # JSON validation + if args.json_per_chat and args.json and ( + (args.json.endswith(".json") and os.path.isfile(args.json)) or + (not args.json.endswith(".json") and os.path.isfile(args.json)) ): parser.error("When --per-chat is enabled, the destination of --json must be a directory.") - if args.enrich_from_vcards is not None and args.default_contry_code is None: + + # vCards validation + if args.enrich_from_vcards is not None and args.default_country_code is None: parser.error("When --enrich-from-vcards is provided, you must also set --default-country-code") + + # Size validation if args.size is not None and not isinstance(args.size, int) and not args.size.isnumeric(): try: args.size = readable_to_bytes(args.size) except ValueError: parser.error("The value for --split must be ended in pure bytes or with a proper unit (e.g., 1048576 or 1MB)") + + # Date filter validation and processing if args.filter_date is not None: - if " - " in args.filter_date: - start, end = args.filter_date.split(" - ") - start = int(datetime.strptime(start, args.filter_date_format).timestamp()) - end = int(datetime.strptime(end, args.filter_date_format).timestamp()) - if start < 1009843200 or end < 1009843200: - parser.error("WhatsApp was first released in 2009...") - if start > end: - parser.error("The start date cannot be a moment after the end date.") - if args.android: - args.filter_date = f"BETWEEN {start}000 AND {end}000" - elif args.ios: - args.filter_date = f"BETWEEN {start - APPLE_TIME} AND {end - APPLE_TIME}" - else: - _timestamp = int(datetime.strptime(args.filter_date[2:], args.filter_date_format).timestamp()) - if _timestamp < 1009843200: - parser.error("WhatsApp was first released in 2009...") - if args.filter_date[:2] == "> ": - if args.android: - args.filter_date = f">= {_timestamp}000" - elif args.ios: - args.filter_date = f">= {_timestamp - APPLE_TIME}" - elif args.filter_date[:2] == "< ": - if args.android: - args.filter_date = f"<= {_timestamp}000" - elif args.ios: - args.filter_date = f"<= {_timestamp - APPLE_TIME}" - else: - parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") + process_date_filter(parser, args) + + # Crypt15 key validation + if args.key is None and args.backup is not None and args.backup.endswith("crypt15"): + args.key = getpass("Enter your encryption key: ") + + # Theme validation if args.whatsapp_theme: args.template = "whatsapp_new.html" + + # Chat filter validation if args.filter_chat_include is not None and args.filter_chat_exclude is not None: parser.error("Chat inclusion and exclusion filters cannot be used together.") - if args.filter_chat_include is not None: - for chat in args.filter_chat_include: + + validate_chat_filters(parser, args.filter_chat_include) + validate_chat_filters(parser, args.filter_chat_exclude) + + +def validate_chat_filters(parser: ArgumentParser, chat_filter: Optional[List[str]]) -> None: + """Validate chat filters to ensure they contain only phone numbers.""" + if chat_filter is not None: + for chat in chat_filter: if not chat.isnumeric(): parser.error("Enter a phone number in the chat filter. See https://wts.knugi.dev/docs?dest=chat") - if args.filter_chat_exclude is not None: - for chat in args.filter_chat_exclude: - if not chat.isnumeric(): - parser.error("Enter a phone number in the chat filter. See https://wts.knugi.dev/docs?dest=chat") - filter_chat = (args.filter_chat_include, args.filter_chat_exclude) - data = {} +def process_date_filter(parser: ArgumentParser, args) -> None: + """Process and validate date filter arguments.""" + if " - " in args.filter_date: + start, end = args.filter_date.split(" - ") + start = int(datetime.strptime(start, args.filter_date_format).timestamp()) + end = int(datetime.strptime(end, args.filter_date_format).timestamp()) + + if start < 1009843200 or end < 1009843200: + parser.error("WhatsApp was first released in 2009...") + if start > end: + parser.error("The start date cannot be a moment after the end date.") + + if args.android: + args.filter_date = f"BETWEEN {start}000 AND {end}000" + elif args.ios: + args.filter_date = f"BETWEEN {start - APPLE_TIME} AND {end - APPLE_TIME}" + else: + process_single_date_filter(parser, args) + + +def process_single_date_filter(parser: ArgumentParser, args) -> None: + """Process single date comparison filters.""" + if len(args.filter_date) < 3: + parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") + + _timestamp = int(datetime.strptime(args.filter_date[2:], args.filter_date_format).timestamp()) + + if _timestamp < 1009843200: + parser.error("WhatsApp was first released in 2009...") + + if args.filter_date[:2] == "> ": + if args.android: + args.filter_date = f">= {_timestamp}000" + elif args.ios: + args.filter_date = f">= {_timestamp - APPLE_TIME}" + elif args.filter_date[:2] == "< ": + if args.android: + args.filter_date = f"<= {_timestamp}000" + elif args.ios: + args.filter_date = f"<= {_timestamp - APPLE_TIME}" + else: + parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") + + +def setup_contact_store(args) -> Optional['ContactsFromVCards']: + """Set up and return a contact store if needed.""" if args.enrich_from_vcards is not None: if not vcards_deps_installed: - parser.error( + print( "You don't have the dependency to enrich contacts with vCard.\n" "Read more on how to deal with enriching contacts:\n" "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" ) + exit(1) contact_store = ContactsFromVCards() - contact_store.load_vcf_file(args.enrich_from_vcards, args.default_contry_code) + contact_store.load_vcf_file(args.enrich_from_vcards, args.default_country_code) + return contact_store + return None + +def decrypt_android_backup(args) -> int: + """Decrypt Android backup files and return error code.""" + if args.key is None or args.backup is None: + print("You must specify the backup file with -b and a key with -k") + return 1 + + print("Decryption key specified, decrypting WhatsApp backup...") + + # Determine crypt type + if "crypt12" in args.backup: + crypt = Crypt.CRYPT12 + elif "crypt14" in args.backup: + crypt = Crypt.CRYPT14 + elif "crypt15" in args.backup: + crypt = Crypt.CRYPT15 + else: + print("Unknown backup format. The backup file must be crypt12, crypt14 or crypt15.") + return 1 + + # Get key + keyfile_stream = False + if not os.path.isfile(args.key) and all(char in string.hexdigits for char in args.key.replace(" ", "")): + key = bytes.fromhex(args.key.replace(" ", "")) + else: + key = open(args.key, "rb") + keyfile_stream = True + + # Read backup + db = open(args.backup, "rb").read() + + # Process WAB if provided + error_wa = 0 + if args.wab: + wab = open(args.wab, "rb").read() + error_wa = android_crypt.decrypt_backup( + wab, + key, + args.wa, + crypt, + args.showkey, + DbType.CONTACT, + keyfile_stream=keyfile_stream, + max_worker=args.max_bruteforce_worker + ) + if isinstance(key, io.IOBase): + key.seek(0) + + # Decrypt message database + error_message = android_crypt.decrypt_backup( + db, + key, + args.db, + crypt, + args.showkey, + DbType.MESSAGE, + keyfile_stream=keyfile_stream, + max_worker=args.max_bruteforce_worker + ) + + # Handle errors + if error_wa != 0: + return error_wa + return error_message + + +def handle_decrypt_error(error: int) -> None: + """Handle decryption errors with appropriate messages.""" + if error == 1: + print("Dependencies of decrypt_backup and/or extract_encrypted_key" + " are not present. For details, see README.md.") + exit(3) + elif error == 2: + print("Failed when decompressing the decrypted backup. " + "Possibly incorrect offsets used in decryption.") + exit(4) + else: + print("Unknown error occurred.", error) + exit(5) + + +def process_contacts(args, data: ChatCollection, contact_store=None) -> None: + """Process contacts from the database.""" + contact_db = args.wa if args.wa else "wa.db" if args.android else "ContactsV2.sqlite" + + if os.path.isfile(contact_db): + with sqlite3.connect(contact_db) as db: + db.row_factory = sqlite3.Row + if args.android: + android_handler.contacts(db, data, args.enrich_from_vcards) + else: + ios_handler.contacts(db, data) + + +def process_messages(args, data: ChatCollection) -> None: + """Process messages, media and vcards from the database.""" + msg_db = args.db if args.db else "msgstore.db" if args.android else args.identifiers.MESSAGE + + if not os.path.isfile(msg_db): + print( + "The message database does not exist. You may specify the path " + "to database file with option -d or check your provided path." + ) + exit(6) + + filter_chat = (args.filter_chat_include, args.filter_chat_exclude) + + with sqlite3.connect(msg_db) as db: + db.row_factory = sqlite3.Row + + # Process messages + if args.android: + message_handler = android_handler + else: + message_handler = ios_handler + + message_handler.messages( + db, data, args.media, args.timezone_offset, + args.filter_date, filter_chat, args.filter_empty + ) + + # Process media + message_handler.media( + db, data, args.media, args.filter_date, + filter_chat, args.filter_empty, args.separate_media + ) + + # Process vcards + message_handler.vcard( + db, data, args.media, args.filter_date, + filter_chat, args.filter_empty + ) + + # Process calls + process_calls(args, db, data, filter_chat) + + +def process_calls(args, db, data: ChatCollection, filter_chat) -> None: + """Process call history if available.""" if args.android: - contacts = android_handler.contacts - messages = android_handler.messages - media = android_handler.media - vcard = android_handler.vcard - create_html = android_handler.create_html - if args.db is None: - msg_db = "msgstore.db" - else: - msg_db = args.db - if args.wa is None: - contact_db = "wa.db" - else: - contact_db = args.wa - if args.key is not None: - if args.backup is None: - print("You must specify the backup file with -b") - exit(1) - print("Decryption key specified, decrypting WhatsApp backup...") - if "crypt12" in args.backup: - crypt = Crypt.CRYPT12 - elif "crypt14" in args.backup: - crypt = Crypt.CRYPT14 - elif "crypt15" in args.backup: - crypt = Crypt.CRYPT15 - if os.path.isfile(args.key): - key = open(args.key, "rb") - elif all(char in string.hexdigits for char in args.key): - key = bytes.fromhex(args.key) - db = open(args.backup, "rb").read() - if args.wab: - wab = open(args.wab, "rb").read() - error_wa = android_handler.decrypt_backup(wab, key, contact_db, crypt, args.showkey, DbType.CONTACT) - if isinstance(key, io.IOBase): - key.seek(0) - else: - error_wa = 0 - error_message = android_handler.decrypt_backup(db, key, msg_db, crypt, args.showkey, DbType.MESSAGE) - if error_wa != 0: - error = error_wa - elif error_message != 0: - error = error_message - else: - error = 0 - if error != 0: - if error == 1: - print("Dependencies of decrypt_backup and/or extract_encrypted_key" - " are not present. For details, see README.md.") - exit(3) - elif error == 2: - print("Failed when decompressing the decrypted backup. " - "Possibly incorrect offsets used in decryption.") - exit(4) - else: - print("Unknown error occurred.", error) - exit(5) - if args.media is None: - args.media = "WhatsApp" + android_handler.calls(db, data, args.timezone_offset, filter_chat) + elif args.ios and args.call_db_ios is not None: + with sqlite3.connect(args.call_db_ios) as cdb: + cdb.row_factory = sqlite3.Row + ios_handler.calls(cdb, data, args.timezone_offset, filter_chat) - if os.path.isfile(contact_db): - with sqlite3.connect(contact_db) as db: - db.row_factory = sqlite3.Row - contacts(db, data) - elif args.ios: - contacts = ios_handler.contacts - messages = ios_handler.messages - media = ios_handler.media - vcard = ios_handler.vcard - create_html = android_handler.create_html - if args.business: - from Whatsapp_Chat_Exporter.utility import WhatsAppBusinessIdentifier as identifiers + +def handle_media_directory(args) -> None: + """Handle media directory copying or moving.""" + if os.path.isdir(args.media): + media_path = os.path.join(args.output, args.media) + + if os.path.isdir(media_path): + print("\nWhatsApp directory already exists in output directory. Skipping...", end="\n") else: - from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier as identifiers - if args.media is None: - args.media = identifiers.DOMAIN - if args.backup is not None: - if not os.path.isdir(args.media): - ios_media_handler.extract_media(args.backup, identifiers, args.decrypt_chunk_size) + if args.move_media: + try: + print("\nMoving media directory...", end="\n") + shutil.move(args.media, f"{args.output}/") + except PermissionError: + print("\nCannot remove original WhatsApp directory. " + "Perhaps the directory is opened?", end="\n") else: - print("WhatsApp directory already exists, skipping WhatsApp file extraction.") - if args.db is None: - msg_db = identifiers.MESSAGE - else: - msg_db = args.db - if args.wa is None: - contact_db = "ContactsV2.sqlite" - else: - contact_db = args.wa - if os.path.isfile(contact_db): - with sqlite3.connect(contact_db) as db: - db.row_factory = sqlite3.Row - contacts(db, data) + print("\nCopying media directory...", end="\n") + shutil.copytree(args.media, media_path) - if not args.exported and not args.import_json: - if os.path.isfile(msg_db): - with sqlite3.connect(msg_db) as db: - db.row_factory = sqlite3.Row - messages(db, data, args.media, args.timezone_offset, args.filter_date, filter_chat, args.filter_empty) - media(db, data, args.media, args.filter_date, filter_chat, args.filter_empty, args.separate_media) - vcard(db, data, args.media, args.filter_date, filter_chat, args.filter_empty) - if args.android: - android_handler.calls(db, data, args.timezone_offset, filter_chat) - elif args.ios and args.call_db_ios is not None: - with sqlite3.connect(args.call_db_ios) as cdb: - cdb.row_factory = sqlite3.Row - ios_handler.calls(cdb, data, args.timezone_offset, filter_chat) - if not args.no_html: - if args.enrich_from_vcards is not None and not contact_store.is_empty(): - contact_store.enrich_from_vcards(data) - create_html( - data, - args.output, - args.template, - args.embedded, - args.offline, - args.size, - args.no_avatar, - args.whatsapp_theme, - args.headline - ) +def create_output_files(args, data: ChatCollection, contact_store=None) -> None: + """Create output files in the specified formats.""" + # Create HTML files if requested + if not args.no_html: + # Enrich from vcards if available + if contact_store and not contact_store.is_empty(): + contact_store.enrich_from_vcards(data) + + android_handler.create_html( + data, + args.output, + args.template, + args.embedded, + args.offline, + args.size, + args.no_avatar, + args.whatsapp_theme, + args.headline + ) + + # Create text files if requested + if args.text_format: + print("Writing text file...") + android_handler.create_txt(data, args.text_format) + + # Create JSON files if requested + if args.json and not args.import_json: + export_json(args, data, contact_store) + + +def export_json(args, data: ChatCollection, contact_store=None) -> None: + """Export data to JSON format.""" + # Enrich from vcards if available + if contact_store and not contact_store.is_empty(): + contact_store.enrich_from_vcards(data) + + # Convert ChatStore objects to JSON + if isinstance(data.get(next(iter(data), None)), ChatStore): + data = {jik: chat.to_json() for jik, chat in data.items()} + + # Export as a single file or per chat + if not args.json_per_chat: + export_single_json(args, data) + else: + export_multiple_json(args, data) + + +def export_single_json(args, data: Dict) -> None: + """Export data to a single JSON file.""" + with open(args.json, "w") as f: + json_data = json.dumps( + data, + ensure_ascii=not args.avoid_encoding_json, + indent=args.pretty_print_json + ) + print(f"\nWriting JSON file...({bytes_to_readable(len(json_data))})") + f.write(json_data) + + +def export_multiple_json(args, data: Dict) -> None: + """Export data to multiple JSON files, one per chat.""" + # Adjust output path if needed + json_path = args.json[:-5] if args.json.endswith(".json") else args.json + + # Create directory if it doesn't exist + if not os.path.isdir(json_path): + os.makedirs(json_path, exist_ok=True) + + # Export each chat + total = len(data.keys()) + for index, jik in enumerate(data.keys()): + if data[jik]["name"] is not None: + contact = data[jik]["name"].replace('/', '') else: - print( - "The message database does not exist. You may specify the path " - "to database file with option -d or check your provided path." + contact = jik.replace('+', '') + + with open(f"{json_path}/{sanitize_filename(contact)}.json", "w") as f: + file_content = json.dumps( + {jik: data[jik]}, + ensure_ascii=not args.avoid_encoding_json, + indent=args.pretty_print_json ) - exit(6) + f.write(file_content) + print(f"Writing JSON file...({index + 1}/{total})", end="\r") + print() - if os.path.isdir(args.media): - media_path = os.path.join(args.output, args.media) - if os.path.isdir(media_path): - print("\nWhatsApp directory already exists in output directory. Skipping...", end="\n") - else: - if not args.move_media: - if os.path.isdir(media_path): - print("\nWhatsApp directory already exists in output directory. Skipping...", end="\n") - else: - print("\nCopying media directory...", end="\n") - shutil.copytree(args.media, media_path) - else: - try: - shutil.move(args.media, f"{args.output}/") - except PermissionError: - print("\nCannot remove original WhatsApp directory. " - "Perhaps the directory is opened?", end="\n") - elif args.exported: - exported_handler.messages(args.exported, data, args.assume_first_as_me) - if not args.no_html: - android_handler.create_html( - data, - args.output, - args.template, - args.embedded, - args.offline, - args.size, - args.no_avatar, - args.whatsapp_theme, - args.headline - ) - for file in glob.glob(r'*.*'): - shutil.copy(file, args.output) - elif args.import_json: + +def process_exported_chat(args, data: ChatCollection) -> None: + """Process an exported chat file.""" + exported_handler.messages(args.exported, data, args.assume_first_as_me) + + if not args.no_html: + android_handler.create_html( + data, + args.output, + args.template, + args.embedded, + args.offline, + args.size, + args.no_avatar, + args.whatsapp_theme, + args.headline + ) + + # Copy files to output directory + for file in glob.glob(r'*.*'): + shutil.copy(file, args.output) + + +def main(): + """Main function to run the WhatsApp Chat Exporter.""" + # Set up and parse arguments + parser = setup_argument_parser() + args = parser.parse_args() + + # Check for updates + if args.check_update: + exit(check_update()) + + # Validate arguments + validate_args(parser, args) + + # Create output directory if it doesn't exist + os.makedirs(args.output, exist_ok=True) + + # Initialize data collection + data = ChatCollection() + + # Set up contact store for vCard enrichment if needed + contact_store = setup_contact_store(args) + + if args.import_json: + # Import from JSON import_from_json(args.json, data) android_handler.create_html( data, @@ -584,48 +672,62 @@ def main(): args.whatsapp_theme, args.headline ) - - if args.text_format: - print("Writing text file...") - android_handler.create_txt(data, args.text_format) - - if args.json and not args.import_json: - if args.enrich_from_vcards is not None and not contact_store.is_empty(): - contact_store.enrich_from_vcards(data) - - if isinstance(data[next(iter(data))], ChatStore): - data = {jik: chat.to_json() for jik, chat in data.items()} - - if not args.json_per_chat: - with open(args.json, "w") as f: - data = json.dumps( - data, - ensure_ascii=not args.avoid_encoding_json, - indent=args.pretty_print_json - ) - print(f"\nWriting JSON file...({bytes_to_readable(len(data))})") - f.write(data) - else: - if args.json[-5:] == ".json": - args.json = args.json[:-5] - total = len(data.keys()) - if not os.path.isdir(args.json): - os.mkdir(args.json) - for index, jik in enumerate(data.keys()): - if data[jik]["name"] is not None: - contact = data[jik]["name"].replace('/', '') - else: - contact = jik.replace('+', '') - with open(f"{args.json}/{sanitize_filename(contact)}.json", "w") as f: - file_content_to_write = json.dumps({jik: data[jik]}, ensure_ascii=not args.avoid_encoding_json, indent=2 if args.pretty_print_json else None) - f.write(file_content_to_write) - print(f"Writing JSON file...({index + 1}/{total})", end="\r") - print() + elif args.exported: + # Process exported chat + process_exported_chat(args, data) else: - print() + # Process Android or iOS data + if args.android: + # Set default media path if not provided + if args.media is None: + args.media = "WhatsApp" + + # Set default DB paths if not provided + if args.db is None: + args.db = "msgstore.db" + if args.wa is None: + args.wa = "wa.db" + + # Decrypt backup if needed + if args.key is not None: + error = decrypt_android_backup(args) + if error != 0: + handle_decrypt_error(error) + elif args.ios: + # Set up identifiers based on business flag + if args.business: + from Whatsapp_Chat_Exporter.utility import WhatsAppBusinessIdentifier as identifiers + else: + from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier as identifiers + args.identifiers = identifiers + + # Set default media path if not provided + if args.media is None: + args.media = identifiers.DOMAIN + + # Extract media from backup if needed + if args.backup is not None: + if not os.path.isdir(args.media): + ios_media_handler.extract_media(args.backup, identifiers, args.decrypt_chunk_size) + else: + print("WhatsApp directory already exists, skipping WhatsApp file extraction.") + + # Set default DB paths if not provided + if args.db is None: + args.db = identifiers.MESSAGE + if args.wa is None: + args.wa = "ContactsV2.sqlite" + + # Process contacts + process_contacts(args, data, contact_store) + + # Process messages, media, and calls + process_messages(args, data) + + # Create output files + create_output_files(args, data, contact_store) + + # Handle media directory + handle_media_directory(args) - print("Everything is done!") - - -if __name__ == "__main__": - main() + print("Everything is done!") \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/android_crypt.py b/Whatsapp_Chat_Exporter/android_crypt.py new file mode 100644 index 0000000..84e629e --- /dev/null +++ b/Whatsapp_Chat_Exporter/android_crypt.py @@ -0,0 +1,328 @@ +import hmac +import io +import zlib +import concurrent.futures +from typing import Tuple, Union +from hashlib import sha256 +from sys import exit +from Whatsapp_Chat_Exporter.utility import CRYPT14_OFFSETS, Crypt, DbType + +try: + import zlib + from Crypto.Cipher import AES +except ModuleNotFoundError: + support_backup = False +else: + support_backup = True + +try: + import javaobj +except ModuleNotFoundError: + support_crypt15 = False +else: + support_crypt15 = True + + +class DecryptionError(Exception): + """Base class for decryption-related exceptions.""" + pass + + +class InvalidKeyError(DecryptionError): + """Raised when the provided key is invalid.""" + pass + + +class InvalidFileFormatError(DecryptionError): + """Raised when the input file format is invalid.""" + pass + + +class OffsetNotFoundError(DecryptionError): + """Raised when the correct offsets for decryption cannot be found.""" + pass + + +def _derive_main_enc_key(key_stream: bytes) -> Tuple[bytes, bytes]: + """ + Derive the main encryption key for the given key stream. + + Args: + key_stream (bytes): The key stream to generate HMAC of HMAC. + + Returns: + Tuple[bytes, bytes]: A tuple containing the main encryption key and the original key stream. + """ + intermediate_hmac = hmac.new(b'\x00' * 32, key_stream, sha256).digest() + key = hmac.new(intermediate_hmac, b"backup encryption\x01", sha256).digest() + return key, key_stream + + +def _extract_enc_key(keyfile: bytes) -> Tuple[bytes, bytes]: + """ + Extract the encryption key from the keyfile. + + Args: + keyfile (bytes): The keyfile containing the encrypted key. + + Returns: + Tuple[bytes, bytes]: values from _derive_main_enc_key() + """ + key_stream = b''.join([byte.to_bytes(1, "big", signed=True) for byte in javaobj.loads(keyfile)]) + return _derive_main_enc_key(key_stream) + + +def brute_force_offset(max_iv: int = 200, max_db: int = 200): + """ + Brute force the offsets for IV and database start position in WhatsApp backup files. + + Args: + max_iv (int, optional): Maximum value to try for IV offset. Defaults to 200. + max_db (int, optional): Maximum value to try for database start offset. Defaults to 200. + + Yields: + tuple: A tuple containing: + - int: Start position of IV + - int: End position of IV (start + 16) + - int: Start position of database + """ + for iv in range(0, max_iv): + for db in range(0, max_db): + yield iv, iv + 16, db + + +def _decrypt_database(db_ciphertext: bytes, main_key: bytes, iv: bytes) -> bytes: + """Decrypt and decompress a database chunk. + + Args: + db_ciphertext (bytes): The encrypted chunk of the database. + main_key (bytes): The main decryption key. + iv (bytes): The initialization vector. + + Returns: + bytes: The decrypted and decompressed database. + + Raises: + zlib.error: If decompression fails. + ValueError: if the plaintext is not a SQLite database. + """ + cipher = AES.new(main_key, AES.MODE_GCM, iv) + db_compressed = cipher.decrypt(db_ciphertext) + db = zlib.decompress(db_compressed) + if db[0:6].upper() != b"SQLITE": + raise ValueError( + "The plaintext is not a SQLite database. Ensure you are using the correct key." + ) + return db + +def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> bytes: + """Decrypt a crypt14 database using multithreading for brute-force offset detection. + + Args: + database (bytes): The encrypted database. + main_key (bytes): The decryption key. + max_worker (int, optional): The maximum number of threads to use for brute force. Defaults to 10. + + Returns: + bytes: The decrypted database. + + Raises: + InvalidFileFormatError: If the file is too small. + OffsetNotFoundError: If no valid offsets are found. + """ + if len(database) < 191: + raise InvalidFileFormatError("The crypt14 file must be at least 191 bytes") + + # Attempt known offsets first + for offsets in CRYPT14_OFFSETS: + iv = database[offsets["iv"]:offsets["iv"] + 16] + db_ciphertext = database[offsets["db"]:] + try: + return _decrypt_database(db_ciphertext, main_key, iv) + except (zlib.error, ValueError): + pass # Try next offset + + print("Common offsets failed. Initiating brute-force with multithreading...") + + # Convert brute force generator into a list for parallel processing + offset_combinations = list(brute_force_offset()) + + def attempt_decrypt(offset_tuple): + """Attempt decryption with the given offsets.""" + start_iv, end_iv, start_db = offset_tuple + iv = database[start_iv:end_iv] + db_ciphertext = database[start_db:] + + try: + db = _decrypt_database(db_ciphertext, main_key, iv) + print( + f"The offsets of your IV and database are {start_iv} and " + f"{start_db}, respectively. To include your offsets in the " + "program, please report it by creating an issue on GitHub: " + "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47" + "\nShutting down other threads..." + ) + return db + except (zlib.error, ValueError): + return None # Decryption failed, move to next + + with concurrent.futures.ThreadPoolExecutor(max_worker) as executor: + future_to_offset = {executor.submit(attempt_decrypt, offset): offset for offset in offset_combinations} + + try: + for future in concurrent.futures.as_completed(future_to_offset): + result = future.result() + if result is not None: + # Shutdown remaining threads + executor.shutdown(wait=False, cancel_futures=True) + return result + + except KeyboardInterrupt: + print("\nBrute force interrupted by user (Ctrl+C). Exiting gracefully...") + executor.shutdown(wait=False, cancel_futures=True) + exit(1) + + raise OffsetNotFoundError("Could not find the correct offsets for decryption.") + + + +def _decrypt_crypt12(database: bytes, main_key: bytes) -> bytes: + """Decrypt a crypt12 database. + + Args: + database (bytes): The encrypted database. + main_key (bytes): The decryption key. + + Returns: + bytes: The decrypted database. + + Raises: + ValueError: If the file format is invalid or the signature mismatches. + """ + if len(database) < 67: + raise InvalidFileFormatError("The crypt12 file must be at least 67 bytes") + + t2 = database[3:35] + iv = database[51:67] + db_ciphertext = database[67:-20] + return _decrypt_database(db_ciphertext, main_key, iv) + + +def _decrypt_crypt15(database: bytes, main_key: bytes, db_type: DbType) -> bytes: + """Decrypt a crypt15 database. + + Args: + database (bytes): The encrypted database. + main_key (bytes): The decryption key. + db_type (DbType): The type of database. + + Returns: + bytes: The decrypted database. + + Raises: + ValueError: If the file format is invalid or the signature mismatches. + """ + if not support_crypt15: + raise RuntimeError("Crypt15 is not supported") + if len(database) < 131: + raise InvalidFileFormatError("The crypt15 file must be at least 131 bytes") + + if db_type == DbType.MESSAGE: + iv = database[8:24] + db_offset = database[0] + 2 + elif db_type == DbType.CONTACT: + iv = database[7:23] + db_offset = database[0] + 1 + else: + raise ValueError(f"Invalid db_type: {db_type}") + + db_ciphertext = database[db_offset:] + return _decrypt_database(db_ciphertext, main_key, iv) + + +def decrypt_backup( + database: bytes, + key: Union[str, io.IOBase], + output: str = None, + crypt: Crypt = Crypt.CRYPT14, + show_crypt15: bool = False, + db_type: DbType = DbType.MESSAGE, + *, + dry_run: bool = False, + keyfile_stream: bool = False, + max_worker: int = 10 +) -> int: + """ + Decrypt the WhatsApp backup database. + + Args: + database (bytes): The encrypted database file. + key (str or io.IOBase): The key to decrypt the database. + output (str, optional): The path to save the decrypted database. Defaults to None. + crypt (Crypt, optional): The encryption version of the database. Defaults to Crypt.CRYPT14. + show_crypt15 (bool, optional): Whether to show the HEX key of the crypt15 backup. Defaults to False. + db_type (DbType, optional): The type of database (MESSAGE or CONTACT). Defaults to DbType.MESSAGE. + dry_run (bool, optional): Whether to perform a dry run. Defaults to False. + keyfile_stream (bool, optional): Whether the key is a key stream. Defaults to False. + + Returns: + int: The status code of the decryption process (0 for success). + + Raises: + ValueError: If the key is invalid or output file not provided when dry_run is False. + DecryptionError: for errors during decryption + RuntimeError: for dependency errors + """ + if not support_backup: + raise RuntimeError("Dependencies for backup decryption are not available.") + + if not dry_run and output is None: + raise ValueError( + "The path to the decrypted database must be specified unless dry_run is true." + ) + + if isinstance(key, io.IOBase): + key = key.read() + + if crypt is not Crypt.CRYPT15 and len(key) != 158: + raise InvalidKeyError("The key file must be 158 bytes") + + #signature check, this is check is used in crypt 12 and 14 + if crypt != Crypt.CRYPT15: + t1 = key[30:62] + + if t1 != database[15:47] and crypt == Crypt.CRYPT14: + raise ValueError("The signature of key file and backup file mismatch") + + if t1 != database[3:35] and crypt == Crypt.CRYPT12: + raise ValueError("The signature of key file and backup file mismatch") + + + if crypt == Crypt.CRYPT15: + if keyfile_stream: + main_key, hex_key = _extract_enc_key(key) + else: + main_key, hex_key = _derive_main_enc_key(key) + if show_crypt15: + hex_key_str = ' '.join([hex_key.hex()[c:c+4] for c in range(0, len(hex_key.hex()), 4)]) + print(f"The HEX key of the crypt15 backup is: {hex_key_str}") + else: + main_key = key[126:] + + try: + if crypt == Crypt.CRYPT14: + db = _decrypt_crypt14(database, main_key, max_worker) + elif crypt == Crypt.CRYPT12: + db = _decrypt_crypt12(database, main_key) + elif crypt == Crypt.CRYPT15: + db = _decrypt_crypt15(database, main_key, db_type) + else: + raise ValueError(f"Unsupported crypt type: {crypt}") + except (InvalidFileFormatError, OffsetNotFoundError, ValueError) as e: + raise DecryptionError(f"Decryption failed: {e}") from e + + + if not dry_run: + with open(output, "wb") as f: + f.write(db) + return 0 diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index 0713232..5133d6c 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -2,195 +2,127 @@ import sqlite3 import os -import io -import hmac import shutil from pathlib import Path from mimetypes import MimeTypes from markupsafe import escape as htmle -from hashlib import sha256 from base64 import b64decode, b64encode from datetime import datetime from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, DbType, convert_time_unit, determine_metadata, get_cond_for_empty -from Whatsapp_Chat_Exporter.utility import rendering, Crypt, Device, get_file_name, setup_template -from Whatsapp_Chat_Exporter.utility import brute_force_offset, CRYPT14_OFFSETS, get_status_location -from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable, JidType - -try: - import zlib - from Crypto.Cipher import AES -except ModuleNotFoundError: - support_backup = False -else: - support_backup = True -try: - import javaobj -except ModuleNotFoundError: - support_crypt15 = False -else: - support_crypt15 = True +from Whatsapp_Chat_Exporter.utility import CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device +from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty +from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata +from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable -def _generate_hmac_of_hmac(key_stream): - key = hmac.new( - hmac.new( - b'\x00' * 32, - key_stream, - sha256 - ).digest(), - b"backup encryption\x01", - sha256 - ) - return key.digest(), key_stream - - -def _extract_encrypted_key(keyfile): - key_stream = b"" - for byte in javaobj.loads(keyfile): - key_stream += byte.to_bytes(1, "big", signed=True) - - return _generate_hmac_of_hmac(key_stream) - - -def decrypt_backup(database, key, output, crypt=Crypt.CRYPT14, show_crypt15=False, db_type=DbType.MESSAGE): - if not support_backup: - return 1 - if isinstance(key, io.IOBase): - key = key.read() - if crypt is not Crypt.CRYPT15: - t1 = key[30:62] - if crypt is not Crypt.CRYPT15 and len(key) != 158: - raise ValueError("The key file must be 158 bytes") - # Determine the IV and database offsets - if crypt == Crypt.CRYPT14: - if len(database) < 191: - raise ValueError("The crypt14 file must be at least 191 bytes") - current_try = 0 - offsets = CRYPT14_OFFSETS[current_try] - t2 = database[15:47] - iv = database[offsets["iv"]:offsets["iv"] + 16] - db_ciphertext = database[offsets["db"]:] - elif crypt == Crypt.CRYPT12: - if len(database) < 67: - raise ValueError("The crypt12 file must be at least 67 bytes") - t2 = database[3:35] - iv = database[51:67] - db_ciphertext = database[67:-20] - elif crypt == Crypt.CRYPT15: - if not support_crypt15: - return 1 - if len(database) < 131: - raise ValueError("The crypt15 file must be at least 131 bytes") - t1 = t2 = None - if db_type == DbType.MESSAGE: - iv = database[8:24] - db_offset = database[0] + 2 # Skip protobuf + protobuf size and backup type - elif db_type == DbType.CONTACT: - iv = database[7:23] - db_offset = database[0] + 1 # Skip protobuf + protobuf size - db_ciphertext = database[db_offset:] - - if t1 != t2: - raise ValueError("The signature of key file and backup file mismatch") - - if crypt == Crypt.CRYPT15: - if len(key) == 32: - main_key, hex_key = _generate_hmac_of_hmac(key) - else: - main_key, hex_key = _extract_encrypted_key(key) - if show_crypt15: - hex_key = [hex_key.hex()[c:c+4] for c in range(0, len(hex_key.hex()), 4)] - print("The HEX key of the crypt15 backup is: " + ' '.join(hex_key)) - else: - main_key = key[126:] - decompressed = False - while not decompressed: - cipher = AES.new(main_key, AES.MODE_GCM, iv) - db_compressed = cipher.decrypt(db_ciphertext) - try: - db = zlib.decompress(db_compressed) - except zlib.error: - if crypt == Crypt.CRYPT14: - current_try += 1 - if current_try < len(CRYPT14_OFFSETS): - offsets = CRYPT14_OFFSETS[current_try] - iv = database[offsets["iv"]:offsets["iv"] + 16] - db_ciphertext = database[offsets["db"]:] - continue - else: - print("Common offsets are not applicable to " - "your backup. Trying to brute force it...") - for start_iv, end_iv, start_db in brute_force_offset(): - iv = database[start_iv:end_iv] - db_ciphertext = database[start_db:] - cipher = AES.new(main_key, AES.MODE_GCM, iv) - db_compressed = cipher.decrypt(db_ciphertext) - try: - db = zlib.decompress(db_compressed) - except zlib.error: - continue - else: - decompressed = True - print( - f"The offsets of your IV and database are {start_iv} and " - f"{start_db}, respectively. To include your offsets in the " - "program, please report it by creating an issue on GitHub: " - "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47" - ) - break - if not decompressed: - return 2 - else: - return 3 - else: - decompressed = True - if db[0:6].upper() == b"SQLITE": - with open(output, "wb") as f: - f.write(db) - return 0 - else: - raise ValueError("The plaintext is not a SQLite database. Did you use the key to encrypt something...") - - -def contacts(db, data): - # Get contacts +def contacts(db, data, enrich_from_vcards): + """ + Process WhatsApp contacts from the database. + + Args: + db: Database connection + data: Data store object + enrich_from_vcards: Path to vCard file for contact enrichment + + Returns: + bool: False if no contacts found, True otherwise + """ c = db.cursor() - c.execute("""SELECT count() FROM wa_contacts""") + c.execute("SELECT count() FROM wa_contacts") total_row_number = c.fetchone()[0] + if total_row_number == 0: - print("No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") + if enrich_from_vcards is not None: + print("No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") + else: + print("No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") return False else: print(f"Processing contacts...({total_row_number})") - c.execute("""SELECT jid, COALESCE(display_name, wa_name) as display_name, status FROM wa_contacts; """) + c.execute("SELECT jid, COALESCE(display_name, wa_name) as display_name, status FROM wa_contacts;") row = c.fetchone() while row is not None: - data[row["jid"]] = ChatStore(Device.ANDROID, row["display_name"]) + current_chat = data.add_chat(row["jid"], ChatStore(Device.ANDROID, row["display_name"])) if row["status"] is not None: - data[row["jid"]].status = row["status"] + current_chat.status = row["status"] row = c.fetchone() + + return True def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty): - # Get message history + """ + Process WhatsApp messages from the database. + + Args: + db: Database connection + data: Data store object + media_folder: Folder containing media files + timezone_offset: Timezone offset + filter_date: Date filter condition + filter_chat: Chat filter conditions + filter_empty: Filter for empty chats + """ c = db.cursor() + total_row_number = _get_message_count(c, filter_empty, filter_date, filter_chat) + print(f"Processing messages...(0/{total_row_number})", end="\r") + try: - c.execute(f"""SELECT count() + content_cursor = _get_messages_cursor_legacy(c, filter_empty, filter_date, filter_chat) + table_message = False + except sqlite3.OperationalError: + try: + content_cursor = _get_messages_cursor_new(c, filter_empty, filter_date, filter_chat) + table_message = True + except Exception as e: + raise e + + i = 0 + # Fetch the first row safely + content = _fetch_row_safely(content_cursor) + + while content is not None: + _process_single_message(data, content, table_message, timezone_offset) + + i += 1 + if i % 1000 == 0: + print(f"Processing messages...({i}/{total_row_number})", end="\r") + + # Fetch the next row safely + content = _fetch_row_safely(content_cursor) + + print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + + +# Helper functions for message processing + +def _get_message_count(cursor, filter_empty, filter_date, filter_chat): + """Get the total number of messages to process.""" + try: + empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") + date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") + + cursor.execute(f"""SELECT count() FROM messages INNER JOIN jid ON messages.key_remote_jid = jid.raw_string LEFT JOIN chat ON chat.jid_row_id = jid._id WHERE 1=1 - {get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push")} - {f'AND timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android")}""") - + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter}""") except sqlite3.OperationalError: - c.execute(f"""SELECT count() + empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") + date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + + cursor.execute(f"""SELECT count() FROM message LEFT JOIN chat ON chat._id = message.chat_row_id @@ -199,15 +131,21 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id WHERE 1=1 - {get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast")} - {f'AND timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")}""") - total_row_number = c.fetchone()[0] - print(f"Processing messages...(0/{total_row_number})", end="\r") + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter}""") + return cursor.fetchone()[0] - try: - c.execute(f"""SELECT messages.key_remote_jid, + +def _get_messages_cursor_legacy(cursor, filter_empty, filter_date, filter_chat): + """Get cursor for legacy database schema.""" + empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") + date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") + + cursor.execute(f"""SELECT messages.key_remote_jid, messages._id, messages.key_from_me, messages.timestamp, @@ -230,16 +168,13 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, jid_old.raw_string as old_jid, jid_new.raw_string as new_jid, jid_global.type as jid_type, - group_concat(receipt_user.receipt_timestamp) as receipt_timestamp, - group_concat(messages.received_timestamp) as received_timestamp, - group_concat(receipt_user.read_timestamp) as read_timestamp, - group_concat(receipt_user.played_timestamp) as played_timestamp, - group_concat(messages.read_device_timestamp) as read_device_timestamp + COALESCE(receipt_user.receipt_timestamp, messages.received_timestamp) as received_timestamp, + COALESCE(receipt_user.read_timestamp, receipt_user.played_timestamp, messages.read_device_timestamp) as read_timestamp FROM messages LEFT JOIN messages_quotes ON messages.quoted_row_id = messages_quotes._id - LEFT JOIN missed_call_logs - ON messages._id = missed_call_logs.message_row_id + LEFT JOIN missed_call_logs + ON messages._id = missed_call_logs.message_row_id INNER JOIN jid jid_global ON messages.key_remote_jid = jid_global.raw_string LEFT JOIN chat @@ -257,16 +192,23 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, LEFT JOIN receipt_user ON receipt_user.message_row_id = messages._id WHERE messages.key_remote_jid <> '-1' - {get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push")} - {f'AND messages.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android")} - {get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android")} + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter} GROUP BY messages._id - ORDER BY messages.timestamp ASC;""" - ) - except sqlite3.OperationalError: - try: - c.execute(f"""SELECT jid_global.raw_string as key_remote_jid, + ORDER BY messages.timestamp ASC;""") + return cursor + + +def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): + """Get cursor for new database schema.""" + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") + date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + + cursor.execute(f"""SELECT jid_global.raw_string as key_remote_jid, message._id, message.from_me as key_from_me, message.timestamp, @@ -283,17 +225,15 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, message.message_type as media_wa_type, jid_group.raw_string as group_sender_jid, chat.subject as chat_subject, - missed_call_logs.video_call, + missed_call_logs.video_call, message.sender_jid_row_id, message_system.action_type, message_system_group.is_me_joined, jid_old.raw_string as old_jid, jid_new.raw_string as new_jid, jid_global.type as jid_type, - group_concat(receipt_user.receipt_timestamp) as receipt_timestamp, - group_concat(message.received_timestamp) as received_timestamp, - group_concat(receipt_user.read_timestamp) as read_timestamp, - group_concat(receipt_user.played_timestamp) as played_timestamp + COALESCE(receipt_user.receipt_timestamp, message.received_timestamp) as received_timestamp, + COALESCE(receipt_user.read_timestamp, receipt_user.played_timestamp) as read_timestamp FROM message LEFT JOIN message_quoted ON message_quoted.message_row_id = message._id @@ -311,8 +251,8 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, ON jid_global._id = chat.jid_row_id LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id - LEFT JOIN missed_call_logs - ON message._id = missed_call_logs.message_row_id + LEFT JOIN missed_call_logs + ON message._id = missed_call_logs.message_row_id LEFT JOIN message_system ON message_system.message_row_id = message._id LEFT JOIN message_system_group @@ -326,180 +266,259 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, LEFT JOIN receipt_user ON receipt_user.message_row_id = message._id WHERE key_remote_jid <> '-1' - {get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")} - {f'AND message.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android")} - {get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android")} - GROUP BY message._id;""" - ) - except Exception as e: - raise e - else: - table_message = True - else: - table_message = False - i = 0 + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter} + GROUP BY message._id;""") + return cursor + + +def _fetch_row_safely(cursor): + """Safely fetch a row from cursor, handling operational errors.""" while True: try: - content = c.fetchone() + content = cursor.fetchone() + return content except sqlite3.OperationalError: continue - else: - break - while content is not None: - if content["key_remote_jid"] not in data: - data[content["key_remote_jid"]] = ChatStore(Device.ANDROID, content["chat_subject"]) - if content["key_remote_jid"] is None: - continue # Not sure - if "sender_jid_row_id" in content: - sender_jid_row_id = content["sender_jid_row_id"] - else: - sender_jid_row_id = None - message = Message( - from_me=not sender_jid_row_id and content["key_from_me"], - timestamp=content["timestamp"], - time=content["timestamp"], - key_id=content["key_id"], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, - message_type=content["media_wa_type"] - ) - if isinstance(content["data"], bytes): - message.data = ("The message is binary data and its base64 is " - '""") - message.data += b64encode(content["data"]).decode("utf-8") + "" - message.safe = message.meta = True - data[content["key_remote_jid"]].add_message(content["_id"], message) - i += 1 - content = c.fetchone() - continue - if content["jid_type"] == JidType.GROUP and content["key_from_me"] == 0: - name = fallback = None - if table_message: - if content["sender_jid_row_id"] > 0: - _jid = content["group_sender_jid"] - if _jid in data: - name = data[_jid].name - if "@" in _jid: - fallback = _jid.split('@')[0] - else: - if content["remote_resource"] is not None: - if content["remote_resource"] in data: - name = data[content["remote_resource"]].name - if "@" in content["remote_resource"]: - fallback = content["remote_resource"].split('@')[0] - message.sender = name or fallback - else: - message.sender = None - if content["quoted"] is not None: - message.reply = content["quoted"] - if content["quoted_data"] is not None and len(content["quoted_data"]) > 200: - message.quoted_data = content["quoted_data"][:201] + "..." - else: - message.quoted_data = content["quoted_data"] +def _process_single_message(data, content, table_message, timezone_offset): + """Process a single message row.""" + if content["key_remote_jid"] is None: + return + + # Get or create the chat + if not data.get_chat(content["key_remote_jid"]): + current_chat = data.add_chat(content["key_remote_jid"], ChatStore(Device.ANDROID, content["chat_subject"])) + else: + current_chat = data.get_chat(content["key_remote_jid"]) + + # Determine sender_jid_row_id + if "sender_jid_row_id" in content: + sender_jid_row_id = content["sender_jid_row_id"] + else: + sender_jid_row_id = None + + # Create message object + message = Message( + from_me=not sender_jid_row_id and content["key_from_me"], + timestamp=content["timestamp"], + time=content["timestamp"], + key_id=content["key_id"], + timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, + message_type=content["media_wa_type"], + received_timestamp=content["received_timestamp"], + read_timestamp=content["read_timestamp"] + ) + + # Handle binary data + if isinstance(content["data"], bytes): + _process_binary_message(message, content) + current_chat.add_message(content["_id"], message) + return + + # Set sender for group chats + if content["jid_type"] == JidType.GROUP and content["key_from_me"] == 0: + _set_group_sender(message, content, data, table_message) + else: + message.sender = None + + # Handle quoted messages + if content["quoted"] is not None: + message.reply = content["quoted"] + if content["quoted_data"] is not None and len(content["quoted_data"]) > 200: + message.quoted_data = content["quoted_data"][:201] + "..." else: - message.reply = None + message.quoted_data = content["quoted_data"] + else: + message.reply = None + + # Handle message caption + if not table_message and content["media_caption"] is not None: + # Old schema + message.caption = content["media_caption"] + elif table_message and content["media_wa_type"] == 1 and content["data"] is not None: + # New schema + message.caption = content["data"] + else: + message.caption = None + + # Handle message content based on status + if content["status"] == 6: # 6 = Metadata + _process_metadata_message(message, content, data, table_message) + else: + # Real message + _process_regular_message(message, content, table_message) + + current_chat.add_message(content["_id"], message) - if not table_message and content["media_caption"] is not None: - # Old schema - message.caption = content["media_caption"] - elif table_message and content["media_wa_type"] == 1 and content["data"] is not None: - # New schema - message.caption = content["data"] + +def _process_binary_message(message, content): + """Process binary message data.""" + message.data = ("The message is binary data and its base64 is " + '""") + message.data += b64encode(content["data"]).decode("utf-8") + "" + message.safe = message.meta = True + + +def _set_group_sender(message, content, data, table_message): + """Set sender name for group messages.""" + name = fallback = None + if table_message: + if content["sender_jid_row_id"] > 0: + _jid = content["group_sender_jid"] + if _jid in data: + name = data.get_chat(_jid).name + if "@" in _jid: + fallback = _jid.split('@')[0] + else: + if content["remote_resource"] is not None: + if content["remote_resource"] in data: + name = data.get_chat(content["remote_resource"]).name + if "@" in content["remote_resource"]: + fallback = content["remote_resource"].split('@')[0] + + message.sender = name or fallback + + +def _process_metadata_message(message, content, data, table_message): + """Process metadata message.""" + message.meta = True + name = fallback = None + + if table_message: + if content["sender_jid_row_id"] > 0: + _jid = content["group_sender_jid"] + if _jid in data: + name = data.get_chat(_jid).name + if "@" in _jid: + fallback = _jid.split('@')[0] else: - message.caption = None - - if content["status"] == 6: # 6 = Metadata, otherwise assume a message + name = "You" + else: + _jid = content["remote_resource"] + if _jid is not None: + if _jid in data: + name = data.get_chat(_jid).name + if "@" in _jid: + fallback = _jid.split('@')[0] + else: + name = "You" + + message.data = determine_metadata(content, name or fallback) + + if isinstance(message.data, str) and "
" in message.data: + message.safe = True + + if message.data is None: + if content["video_call"] is not None: # Missed call message.meta = True - name = fallback = None - if table_message: - if content["sender_jid_row_id"] > 0: - _jid = content["group_sender_jid"] - if _jid in data: - name = data[_jid].name - if "@" in _jid: - fallback = _jid.split('@')[0] - else: - name = "You" - else: - _jid = content["remote_resource"] - if _jid is not None: - if _jid in data: - name = data[_jid].name - if "@" in _jid: - fallback = _jid.split('@')[0] - else: - name = "You" - message.data = determine_metadata(content, name or fallback) - if isinstance(message.data, str) and "
" in message.data: - message.safe = True - if message.data is None: - if content["video_call"] is not None: # Missed call - message.meta = True - if content["video_call"] == 1: - message.data = "A video call was missed" - elif content["video_call"] == 0: - message.data = "A voice call was missed" - elif content["data"] is None and content["thumb_image"] is None: - message.meta = True - message.data = None - else: - # Real message - message.sticker = content["media_wa_type"] == 20 # Sticker is a message - if content["key_from_me"] == 1: - if content["status"] == 5 and content["edit_version"] == 7 or table_message and content["media_wa_type"] == 15: - msg = "Message deleted" - message.meta = True - else: - if content["media_wa_type"] == 5: - msg = f"Location shared: {content['latitude'], content['longitude']}" - message.meta = True - else: - msg = content["data"] - if msg is not None: - if "\r\n" in msg: - msg = msg.replace("\r\n", "
") - if "\n" in msg: - msg = msg.replace("\n", "
") - else: - if content["status"] == 0 and content["edit_version"] == 7 or table_message and content["media_wa_type"] == 15: - msg = "Message deleted" - message.meta = True - else: - if content["media_wa_type"] == 5: - msg = f"Location shared: {content['latitude'], content['longitude']}" - message.meta = True - else: - msg = content["data"] - if msg is not None: - if "\r\n" in msg: - msg = msg.replace("\r\n", "
") - if "\n" in msg: - msg = msg.replace("\n", "
") - message.data = msg + if content["video_call"] == 1: + message.data = "A video call was missed" + elif content["video_call"] == 0: + message.data = "A voice call was missed" + elif content["data"] is None and content["thumb_image"] is None: + message.meta = True + message.data = None - data[content["key_remote_jid"]].add_message(content["_id"], message) - i += 1 - if i % 1000 == 0: - print(f"Processing messages...({i}/{total_row_number})", end="\r") - while True: - try: - content = c.fetchone() - except sqlite3.OperationalError: - continue + +def _process_regular_message(message, content, table_message): + """Process regular (non-metadata) message.""" + message.sticker = content["media_wa_type"] == 20 # Sticker is a message + + if content["key_from_me"] == 1: + if content["status"] == 5 and content["edit_version"] == 7 or table_message and content["media_wa_type"] == 15: + msg = "Message deleted" + message.meta = True + else: + if content["media_wa_type"] == 5: + msg = f"Location shared: {content['latitude'], content['longitude']}" + message.meta = True else: - break - print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + msg = content["data"] + if msg is not None: + msg = _format_message_text(msg) + else: + if content["status"] == 0 and content["edit_version"] == 7 or table_message and content["media_wa_type"] == 15: + msg = "Message deleted" + message.meta = True + else: + if content["media_wa_type"] == 5: + msg = f"Location shared: {content['latitude'], content['longitude']}" + message.meta = True + else: + msg = content["data"] + if msg is not None: + msg = _format_message_text(msg) + + message.data = msg + + +def _format_message_text(text): + """Format message text, replacing newlines with HTML breaks.""" + if "\r\n" in text: + text = text.replace("\r\n", "
") + if "\n" in text: + text = text.replace("\n", "
") + return text def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True): - # Get media + """ + Process WhatsApp media files from the database. + + Args: + db: Database connection + data: Data store object + media_folder: Folder containing media files + filter_date: Date filter condition + filter_chat: Chat filter conditions + filter_empty: Filter for empty chats + separate_media: Whether to separate media files by chat + """ c = db.cursor() + total_row_number = _get_media_count(c, filter_empty, filter_date, filter_chat) + print(f"\nProcessing media...(0/{total_row_number})", end="\r") + try: - c.execute(f"""SELECT count() + content_cursor = _get_media_cursor_legacy(c, filter_empty, filter_date, filter_chat) + except sqlite3.OperationalError: + content_cursor = _get_media_cursor_new(c, filter_empty, filter_date, filter_chat) + + content = content_cursor.fetchone() + mime = MimeTypes() + + # Ensure thumbnails directory exists + Path(f"{media_folder}/thumbnails").mkdir(parents=True, exist_ok=True) + + i = 0 + while content is not None: + _process_single_media(data, content, media_folder, mime, separate_media) + + i += 1 + if i % 100 == 0: + print(f"Processing media...({i}/{total_row_number})", end="\r") + + content = content_cursor.fetchone() + + print(f"Processing media...({total_row_number}/{total_row_number})", end="\r") + + +# Helper functions for media processing + +def _get_media_count(cursor, filter_empty, filter_date, filter_chat): + """Get the total number of media files to process.""" + try: + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") + date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + + cursor.execute(f"""SELECT count() FROM message_media INNER JOIN messages ON message_media.message_row_id = messages._id @@ -508,12 +527,17 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa LEFT JOIN chat ON chat.jid_row_id = jid._id WHERE 1=1 - {get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push")} - {f'AND messages.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android")}""") + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter}""") except sqlite3.OperationalError: - c.execute(f"""SELECT count() + empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") + date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + + cursor.execute(f"""SELECT count() FROM message_media INNER JOIN message ON message_media.message_row_id = message._id @@ -524,40 +548,54 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id WHERE 1=1 - {get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast")} - {f'AND message.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")}""") - total_row_number = c.fetchone()[0] - print(f"\nProcessing media...(0/{total_row_number})", end="\r") - i = 0 - try: - c.execute(f"""SELECT messages.key_remote_jid, + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter}""") + return cursor.fetchone()[0] + + +def _get_media_cursor_legacy(cursor, filter_empty, filter_date, filter_chat): + """Get cursor for legacy media database schema.""" + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") + date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + + cursor.execute(f"""SELECT messages.key_remote_jid, message_row_id, file_path, message_url, mime_type, media_key, file_hash, - thumbnail + thumbnail FROM message_media INNER JOIN messages ON message_media.message_row_id = messages._id - LEFT JOIN media_hash_thumbnail - ON message_media.file_hash = media_hash_thumbnail.media_hash + LEFT JOIN media_hash_thumbnail + ON message_media.file_hash = media_hash_thumbnail.media_hash INNER JOIN jid ON messages.key_remote_jid = jid.raw_string LEFT JOIN chat ON chat.jid_row_id = jid._id WHERE jid.type <> 7 - {get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")} - {f'AND messages.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android")} - ORDER BY messages.key_remote_jid ASC""" - ) - except sqlite3.OperationalError: - c.execute(f"""SELECT jid.raw_string as key_remote_jid, + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter} + ORDER BY messages.key_remote_jid ASC""") + return cursor + + +def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat): + """Get cursor for new media database schema.""" + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") + date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' + include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + + cursor.execute(f"""SELECT jid.raw_string as key_remote_jid, message_row_id, file_path, message_url, @@ -573,202 +611,295 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa INNER JOIN jid ON jid._id = chat.jid_row_id LEFT JOIN media_hash_thumbnail - ON message_media.file_hash = media_hash_thumbnail.media_hash + ON message_media.file_hash = media_hash_thumbnail.media_hash LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id WHERE jid.type <> 7 - {get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")} - {f'AND message.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android")} - ORDER BY jid.raw_string ASC""" - ) - content = c.fetchone() - mime = MimeTypes() - if not os.path.isdir(f"{media_folder}/thumbnails"): - Path(f"{media_folder}/thumbnails").mkdir(parents=True, exist_ok=True) - while content is not None: - file_path = f"{media_folder}/{content['file_path']}" - message = data[content["key_remote_jid"]].messages[content["message_row_id"]] - message.media = True - if os.path.isfile(file_path): - message.data = file_path - if content["mime_type"] is None: - guess = mime.guess_type(file_path)[0] - if guess is not None: - message.mime = guess - else: - message.mime = "application/octet-stream" + {empty_filter} + {date_filter} + {include_filter} + {exclude_filter} + ORDER BY jid.raw_string ASC""") + return cursor + + +def _process_single_media(data, content, media_folder, mime, separate_media): + """Process a single media file.""" + file_path = f"{media_folder}/{content['file_path']}" + current_chat = data.get_chat(content["key_remote_jid"]) + message = current_chat.get_message(content["message_row_id"]) + message.media = True + + if os.path.isfile(file_path): + message.data = file_path + + # Set mime type + if content["mime_type"] is None: + guess = mime.guess_type(file_path)[0] + if guess is not None: + message.mime = guess else: - message.mime = content["mime_type"] - if separate_media: - chat_display_name = slugify(data[content["key_remote_jid"]].name or message.sender \ - or content["key_remote_jid"].split('@')[0], True) - current_filename = file_path.split("/")[-1] - new_folder = os.path.join(media_folder, "separated", chat_display_name) - Path(new_folder).mkdir(parents=True, exist_ok=True) - new_path = os.path.join(new_folder, current_filename) - shutil.copy2(file_path, new_path) - message.data = new_path + message.mime = "application/octet-stream" else: - message.data = "The media is missing" - message.mime = "media" - message.meta = True - if content["thumbnail"] is not None: - thumb_path = f"{media_folder}/thumbnails/{b64decode(content['file_hash']).hex()}.png" - if not os.path.isfile(thumb_path): - with open(thumb_path, "wb") as f: - f.write(content["thumbnail"]) - message.thumb = thumb_path - i += 1 - if i % 100 == 0: - print(f"Processing media...({i}/{total_row_number})", end="\r") - content = c.fetchone() - print( - f"Processing media...({total_row_number}/{total_row_number})", end="\r") + message.mime = content["mime_type"] + + # Copy media to separate folder if needed + if separate_media: + chat_display_name = slugify(current_chat.name or message.sender + or content["key_remote_jid"].split('@')[0], True) + current_filename = file_path.split("/")[-1] + new_folder = os.path.join(media_folder, "separated", chat_display_name) + Path(new_folder).mkdir(parents=True, exist_ok=True) + new_path = os.path.join(new_folder, current_filename) + shutil.copy2(file_path, new_path) + message.data = new_path + else: + message.data = "The media is missing" + message.mime = "media" + message.meta = True + + # Handle thumbnail + if content["thumbnail"] is not None: + thumb_path = f"{media_folder}/thumbnails/{b64decode(content['file_hash']).hex()}.png" + if not os.path.isfile(thumb_path): + with open(thumb_path, "wb") as f: + f.write(content["thumbnail"]) + message.thumb = thumb_path def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): + """Process vCard data from WhatsApp database and save to files.""" c = db.cursor() try: - c.execute(f"""SELECT message_row_id, - messages.key_remote_jid, - vcard, - messages.media_name - FROM messages_vcards - INNER JOIN messages - ON messages_vcards.message_row_id = messages._id - INNER JOIN jid - ON messages.key_remote_jid = jid.raw_string - LEFT JOIN chat - ON chat.jid_row_id = jid._id - WHERE 1=1 - {get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push")} - {f'AND messages.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android")} - ORDER BY messages.key_remote_jid ASC;""" - ) + rows = _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty) except sqlite3.OperationalError: - c.execute(f"""SELECT message_row_id, - jid.raw_string as key_remote_jid, - vcard, - message.text_data as media_name - FROM message_vcard - INNER JOIN message - ON message_vcard.message_row_id = message._id - LEFT JOIN chat - ON chat._id = message.chat_row_id - INNER JOIN jid - ON jid._id = chat.jid_row_id - LEFT JOIN jid jid_group - ON jid_group._id = message.sender_jid_row_id - WHERE 1=1 - {get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")} - {f'AND message.timestamp {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android")} - {get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android")} - ORDER BY message.chat_row_id ASC;""" - ) + rows = _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty) - rows = c.fetchall() total_row_number = len(rows) print(f"\nProcessing vCards...(0/{total_row_number})", end="\r") - path = f"{media_folder}/vCards" - if not os.path.isdir(path): - Path(path).mkdir(parents=True, exist_ok=True) + + # Create vCards directory if it doesn't exist + path = os.path.join(media_folder, "vCards") + Path(path).mkdir(parents=True, exist_ok=True) + for index, row in enumerate(rows): - media_name = row["media_name"] if row["media_name"] is not None else "Undefined vCard File" - file_name = "".join(x for x in media_name if x.isalnum()) - file_name = file_name.encode('utf-8')[:230].decode('utf-8', 'ignore') - file_path = os.path.join(path, f"{file_name}.vcf") - if not os.path.isfile(file_path): - with open(file_path, "w", encoding="utf-8") as f: - f.write(row["vcard"]) - message = data[row["key_remote_jid"]].messages[row["message_row_id"]] - message.data = "This media include the following vCard file(s):
" \ - f'{htmle(media_name)}' - message.mime = "text/x-vcard" - message.meta = True - message.safe = True + _process_vcard_row(row, path, data) print(f"Processing vCards...({index + 1}/{total_row_number})", end="\r") +def _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty): + """Execute vCard query for modern WhatsApp database schema.""" + + # Build the filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") + + query = f"""SELECT message_row_id, + messages.key_remote_jid, + vcard, + messages.media_name + FROM messages_vcards + INNER JOIN messages + ON messages_vcards.message_row_id = messages._id + INNER JOIN jid + ON messages.key_remote_jid = jid.raw_string + LEFT JOIN chat + ON chat.jid_row_id = jid._id + WHERE 1=1 + {empty_filter} + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + ORDER BY messages.key_remote_jid ASC;""" + c.execute(query) + return c.fetchall() + + +def _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty): + """Execute vCard query for legacy WhatsApp database schema.""" + + # Build the filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") + + query = f"""SELECT message_row_id, + jid.raw_string as key_remote_jid, + vcard, + message.text_data as media_name + FROM message_vcard + INNER JOIN message + ON message_vcard.message_row_id = message._id + LEFT JOIN chat + ON chat._id = message.chat_row_id + INNER JOIN jid + ON jid._id = chat.jid_row_id + LEFT JOIN jid jid_group + ON jid_group._id = message.sender_jid_row_id + WHERE 1=1 + {empty_filter} + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + ORDER BY message.chat_row_id ASC;""" + c.execute(query) + return c.fetchall() + + +def _process_vcard_row(row, path, data): + """Process a single vCard row and save to file.""" + media_name = row["media_name"] if row["media_name"] is not None else "Undefined vCard File" + file_name = "".join(x for x in media_name if x.isalnum()) + file_name = file_name.encode('utf-8')[:230].decode('utf-8', 'ignore') + file_path = os.path.join(path, f"{file_name}.vcf") + + if not os.path.isfile(file_path): + with open(file_path, "w", encoding="utf-8") as f: + f.write(row["vcard"]) + + message = data.get_chat(row["key_remote_jid"]).get_message(row["message_row_id"]) + message.data = "This media include the following vCard file(s):
" \ + f'{htmle(media_name)}' + message.mime = "text/x-vcard" + message.meta = True + message.safe = True + + def calls(db, data, timezone_offset, filter_chat): + """Process call logs from WhatsApp database.""" c = db.cursor() - c.execute(f"""SELECT count() - FROM call_log - INNER JOIN jid - ON call_log.jid_row_id = jid._id - LEFT JOIN chat - ON call_log.jid_row_id = chat.jid_row_id - WHERE 1=1 - {get_chat_condition(filter_chat[0], True, ["jid.raw_string"])} - {get_chat_condition(filter_chat[1], False, ["jid.raw_string"])}""") - total_row_number = c.fetchone()[0] + + # Check if there are any calls that match the filter + total_row_number = _get_calls_count(c, filter_chat) if total_row_number == 0: return + print(f"\nProcessing calls...({total_row_number})", end="\r") - c.execute(f"""SELECT call_log._id, - jid.raw_string, - from_me, - call_id, - timestamp, - video_call, - duration, - call_result, - bytes_transferred, - chat.subject as chat_subject - FROM call_log - INNER JOIN jid - ON call_log.jid_row_id = jid._id - LEFT JOIN chat - ON call_log.jid_row_id = chat.jid_row_id - WHERE 1=1 - {get_chat_condition(filter_chat[0], True, ["jid.raw_string"])} - {get_chat_condition(filter_chat[1], False, ["jid.raw_string"])}""" - ) + + # Fetch call data + calls_data = _fetch_calls_data(c, filter_chat) + + # Create a chat store for all calls chat = ChatStore(Device.ANDROID, "WhatsApp Calls") - content = c.fetchone() + + # Process each call + content = calls_data.fetchone() while content is not None: - call = Message( - from_me=content["from_me"], - timestamp=content["timestamp"], - time=content["timestamp"], - key_id=content["call_id"], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET + _process_call_record(content, chat, data, timezone_offset) + content = calls_data.fetchone() + + # Add the calls chat to the data + data.add_chat("000000000000000", chat) + + +def _get_calls_count(c, filter_chat): + """Get the count of call records that match the filter.""" + + # Build the filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["jid.raw_string"]) + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["jid.raw_string"]) + + query = f"""SELECT count() + FROM call_log + INNER JOIN jid + ON call_log.jid_row_id = jid._id + LEFT JOIN chat + ON call_log.jid_row_id = chat.jid_row_id + WHERE 1=1 + {chat_filter_include} + {chat_filter_exclude}""" + c.execute(query) + return c.fetchone()[0] + + +def _fetch_calls_data(c, filter_chat): + """Fetch call data from the database.""" + + # Build the filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["jid.raw_string"]) + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["jid.raw_string"]) + + query = f"""SELECT call_log._id, + jid.raw_string, + from_me, + call_id, + timestamp, + video_call, + duration, + call_result, + bytes_transferred, + chat.subject as chat_subject + FROM call_log + INNER JOIN jid + ON call_log.jid_row_id = jid._id + LEFT JOIN chat + ON call_log.jid_row_id = chat.jid_row_id + WHERE 1=1 + {chat_filter_include} + {chat_filter_exclude}""" + c.execute(query) + return c + + +def _process_call_record(content, chat, data, timezone_offset): + """Process a single call record and add it to the chat.""" + call = Message( + from_me=content["from_me"], + timestamp=content["timestamp"], + time=content["timestamp"], + key_id=content["call_id"], + timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, + received_timestamp=None, # TODO: Add timestamp + read_timestamp=None # TODO: Add timestamp + ) + + # Get caller/callee name + _jid = content["raw_string"] + name = data.get_chat(_jid).name if _jid in data else content["chat_subject"] or None + if _jid is not None and "@" in _jid: + fallback = _jid.split('@')[0] + else: + fallback = None + call.sender = name or fallback + + # Set metadata + call.meta = True + + # Construct call description based on call type and result + call.data = _construct_call_description(content, call) + + # Add call to chat + chat.add_message(content["_id"], call) + + +def _construct_call_description(content, call): + """Construct a description of the call based on its type and result.""" + description = ( + f"A {'video' if content['video_call'] else 'voice'} " + f"call {'to' if call.from_me else 'from'} " + f"{call.sender} was " + ) + + if content['call_result'] in (0, 4, 7): + description += "cancelled." if call.from_me else "missed." + elif content['call_result'] == 2: + description += "not answered." if call.from_me else "missed." + elif content['call_result'] == 3: + description += "unavailable." + elif content['call_result'] == 5: + call_time = convert_time_unit(content['duration']) + call_bytes = bytes_to_readable(content['bytes_transferred']) + description += ( + f"initiated and lasted for {call_time} " + f"with {call_bytes} data transferred." ) - _jid = content["raw_string"] - name = data[_jid].name if _jid in data else content["chat_subject"] or None - if _jid is not None and "@" in _jid: - fallback = _jid.split('@')[0] - else: - fallback = None - call.sender = name or fallback - call.meta = True - call.data = ( - f"A {'video' if content['video_call'] else 'voice'} " - f"call {'to' if call.from_me else 'from'} " - f"{call.sender} was " - ) - if content['call_result'] in (0, 4, 7): - call.data += "cancelled." if call.from_me else "missed." - elif content['call_result'] == 2: - call.data += "not answered." if call.from_me else "missed." - elif content['call_result'] == 3: - call.data += "unavailable." - elif content['call_result'] == 5: - call_time = convert_time_unit(content['duration']) - call_bytes = bytes_to_readable(content['bytes_transferred']) - call.data += ( - f"initiated and lasted for {call_time} " - f"with {call_bytes} data transferred." - ) - else: - call.data += "in an unknown state." - chat.add_message(content["_id"], call) - content = c.fetchone() - data["000000000000000"] = chat + else: + description += "in an unknown state." + + return description def create_html( @@ -782,117 +913,185 @@ def create_html( experimental=False, headline=None ): + """Generate HTML chat files from data.""" template = setup_template(template, no_avatar, experimental) total_row_number = len(data) print(f"\nGenerating chats...(0/{total_row_number})", end="\r") + # Create output directory if it doesn't exist if not os.path.isdir(output_folder): os.mkdir(output_folder) w3css = get_status_location(output_folder, offline_static) for current, contact in enumerate(data): - chat = data[contact] - safe_file_name, name = get_file_name(contact, chat) + current_chat = data.get_chat(contact) + if len(current_chat) == 0: + # Skip empty chats + continue + + safe_file_name, name = get_file_name(contact, current_chat) if maximum_size is not None: - current_size = 0 - current_page = 1 - render_box = [] - if maximum_size == 0: - maximum_size = MAX_SIZE - last_msg = chat.get_last_message().key_id - for message in chat.get_messages(): - if message.data is not None and not message.meta and not message.media: - current_size += len(message.data) + ROW_SIZE - else: - current_size += ROW_SIZE + 100 # Assume media and meta HTML are 100 bytes - if current_size > maximum_size: - output_file_name = f"{output_folder}/{safe_file_name}-{current_page}.html" - rendering( - output_file_name, - template, - name, - render_box, - contact, - w3css, - chat, - headline, - next=f"{safe_file_name}-{current_page + 1}.html", - previous=f"{safe_file_name}-{current_page - 1}.html" if current_page > 1 else False - ) - render_box = [message] - current_size = 0 - current_page += 1 - else: - render_box.append(message) - if message.key_id == last_msg: - if current_page == 1: - output_file_name = f"{output_folder}/{safe_file_name}.html" - else: - output_file_name = f"{output_folder}/{safe_file_name}-{current_page}.html" - rendering( - output_file_name, - template, - name, - render_box, - contact, - w3css, - chat, - headline, - False, - previous=f"{safe_file_name}-{current_page - 1}.html" - ) - else: - output_file_name = f"{output_folder}/{safe_file_name}.html" - rendering( - output_file_name, - template, - name, - chat.get_messages(), - contact, - w3css, - chat, - headline, - False + _generate_paginated_chat( + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, + maximum_size, + headline ) + else: + _generate_single_chat( + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, + headline + ) + if current % 10 == 0: print(f"Generating chats...({current}/{total_row_number})", end="\r") print(f"Generating chats...({total_row_number}/{total_row_number})", end="\r") +def _generate_single_chat(current_chat, safe_file_name, name, contact, output_folder, template, w3css, headline): + """Generate a single HTML file for a chat.""" + output_file_name = f"{output_folder}/{safe_file_name}.html" + rendering( + output_file_name, + template, + name, + current_chat.values(), + contact, + w3css, + current_chat, + headline, + False + ) + + +def _generate_paginated_chat(current_chat, safe_file_name, name, contact, output_folder, template, w3css, maximum_size, headline): + """Generate multiple HTML files for a chat when pagination is required.""" + current_size = 0 + current_page = 1 + render_box = [] + + # Use default maximum size if set to 0 + if maximum_size == 0: + maximum_size = MAX_SIZE + + last_msg = current_chat.get_last_message().key_id + + for message in current_chat.values(): + # Calculate message size + if message.data is not None and not message.meta and not message.media: + current_size += len(message.data) + ROW_SIZE + else: + current_size += ROW_SIZE + 100 # Assume media and meta HTML are 100 bytes + + if current_size > maximum_size: + # Create a new page + output_file_name = f"{output_folder}/{safe_file_name}-{current_page}.html" + rendering( + output_file_name, + template, + name, + render_box, + contact, + w3css, + current_chat, + headline, + next=f"{safe_file_name}-{current_page + 1}.html", + previous=f"{safe_file_name}-{current_page - 1}.html" if current_page > 1 else False + ) + render_box = [message] + current_size = 0 + current_page += 1 + else: + render_box.append(message) + if message.key_id == last_msg: + # Last message, create final page + if current_page == 1: + output_file_name = f"{output_folder}/{safe_file_name}.html" + else: + output_file_name = f"{output_folder}/{safe_file_name}-{current_page}.html" + rendering( + output_file_name, + template, + name, + render_box, + contact, + w3css, + current_chat, + headline, + False, + previous=f"{safe_file_name}-{current_page - 1}.html" + ) + + def create_txt(data, output): + """Generate text files from chat data.""" os.makedirs(output, exist_ok=True) + for jik, chat in data.items(): + if len(chat) == 0: + continue + + # Determine file name if chat.name is not None: contact = chat.name.replace('/', '') else: contact = jik.replace('+', '') + output_file = os.path.join(output, f"{contact}.txt") + with open(output_file, "w", encoding="utf8") as f: - for message in chat.messages.values(): - date = datetime.fromtimestamp(message.timestamp).date() + for message in chat.values(): + # Skip metadata in text format if message.meta and message.mime != "media": - continue # Skip any metadata in text format - if message.from_me: - name = "You" - else: - name = message.sender if message.sender else contact - prefix = f"[{date} {message.time}] {name}: " - prefix_length = len(prefix) - if message.media and ("/" in message.mime or message.mime == "media"): - if message.data == "The media is missing": - message_text = "" - else: - message_text = f"" - else: - if message.data is None: - message_text = "" - else: - message_text = message.data.replace('
', f'\n{" " * prefix_length}') - if message.caption is not None: - message_text += "\n" + ' ' * len(prefix) + message.caption.replace('
', f'\n{" " * prefix_length}') - f.write(f"{prefix}{message_text}\n") + continue + + # Format the message + formatted_message = _format_message_for_txt(message, contact) + f.write(f"{formatted_message}\n") + +def _format_message_for_txt(message, contact): + """Format a message for text output.""" + date = datetime.fromtimestamp(message.timestamp).date() + + # Determine the sender name + if message.from_me: + name = "You" + else: + name = message.sender if message.sender else contact + + prefix = f"[{date} {message.time}] {name}: " + prefix_length = len(prefix) + + # Handle different message types + if message.media and ("/" in message.mime or message.mime == "media"): + if message.data == "The media is missing": + message_text = "" + else: + message_text = f"" + else: + if message.data is None: + message_text = "" + else: + message_text = message.data.replace('
', f'\n{" " * prefix_length}') + + # Add caption if present + if message.caption is not None: + message_text += "\n" + ' ' * len(prefix) + message.caption.replace('
', f'\n{" " * prefix_length}') + + return f"{prefix}{message_text}" diff --git a/Whatsapp_Chat_Exporter/bplist.py b/Whatsapp_Chat_Exporter/bplist.py index fddd964..390fe6e 100644 --- a/Whatsapp_Chat_Exporter/bplist.py +++ b/Whatsapp_Chat_Exporter/bplist.py @@ -234,7 +234,7 @@ class BPListReader(object): # read trailer self.offset_size, self.object_ref_size, self.number_of_objects, self.top_object, self.table_offset = struct.unpack('!6xBB4xI4xI4xI', self.data[-32:]) #print "** plist offset_size:",self.offset_size,"objref_size:",self.object_ref_size,"num_objs:",self.number_of_objects,"top:",self.top_object,"table_ofs:",self.table_offset - + # read offset table self.offset_table = self.data[self.table_offset:-32] self.offsets = [] diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index f08acca..e84154d 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -1,25 +1,172 @@ -#!/usr/bin/python3 - import os from datetime import datetime, tzinfo, timedelta -from typing import Union +from typing import MutableMapping, Union, Optional, Dict, Any + + +class Timing: + """ + Handles timestamp formatting with timezone support. + """ + def __init__(self, timezone_offset: Optional[int]) -> None: + """ + Initialize Timing object. + + Args: + timezone_offset (Optional[int]): Hours offset from UTC + """ + self.timezone_offset = timezone_offset + + def format_timestamp(self, timestamp: Optional[Union[int, float]], format: str) -> Optional[str]: + """ + Format a timestamp with the specified format string. + + Args: + timestamp (Optional[Union[int, float]]): Unix timestamp to format + format (str): strftime format string + + Returns: + Optional[str]: Formatted timestamp string, or None if timestamp is None + """ + if timestamp: + timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp + return datetime.fromtimestamp(timestamp, TimeZone(self.timezone_offset)).strftime(format) + return None class TimeZone(tzinfo): - def __init__(self, offset): + """ + Custom timezone class with fixed offset. + """ + def __init__(self, offset: int) -> None: + """ + Initialize TimeZone object. + + Args: + offset (int): Hours offset from UTC + """ self.offset = offset - def utcoffset(self, dt): - return timedelta(hours=self.offset) - def dst(self, dt): - return timedelta(0) + + def utcoffset(self, dt: Optional[datetime]) -> timedelta: + """Get UTC offset.""" + return timedelta(hours=self.offset) + + def dst(self, dt: Optional[datetime]) -> timedelta: + """Get DST offset (always 0).""" + return timedelta(0) -class ChatStore(): - def __init__(self, type, name=None, media=None): +class ChatCollection(MutableMapping): + """ + A collection of chats that provides dictionary-like access with additional chat management methods. + Inherits from MutableMapping to implement a custom dictionary-like behavior. + """ + + def __init__(self) -> None: + """Initialize an empty chat collection.""" + self._chats: Dict[str, ChatStore] = {} + + def __getitem__(self, key: str) -> 'ChatStore': + """Get a chat by its ID. Required for dict-like access.""" + return self._chats[key] + + def __setitem__(self, key: str, value: 'ChatStore') -> None: + """Set a chat by its ID. Required for dict-like access.""" + if not isinstance(value, ChatStore): + raise TypeError("Value must be a ChatStore object") + self._chats[key] = value + + def __delitem__(self, key: str) -> None: + """Delete a chat by its ID. Required for dict-like access.""" + del self._chats[key] + + def __iter__(self): + """Iterate over chat IDs. Required for dict-like access.""" + return iter(self._chats) + + def __len__(self) -> int: + """Get number of chats. Required for dict-like access.""" + return len(self._chats) + + def get_chat(self, chat_id: str) -> Optional['ChatStore']: + """ + Get a chat by its ID. + + Args: + chat_id (str): The ID of the chat to retrieve + + Returns: + Optional['ChatStore']: The chat if found, None otherwise + """ + return self._chats.get(chat_id) + + def add_chat(self, chat_id: str, chat: 'ChatStore') -> None: + """ + Add a new chat to the collection. + + Args: + chat_id (str): The ID for the chat + chat (ChatStore): The chat to add + + Raises: + TypeError: If chat is not a ChatStore object + """ + if not isinstance(chat, ChatStore): + raise TypeError("Chat must be a ChatStore object") + self._chats[chat_id] = chat + return self._chats[chat_id] + + def remove_chat(self, chat_id: str) -> None: + """ + Remove a chat from the collection. + + Args: + chat_id (str): The ID of the chat to remove + """ + if chat_id in self._chats: + del self._chats[chat_id] + + def items(self): + """Get chat items (id, chat) pairs.""" + return self._chats.items() + + def values(self): + """Get all chats.""" + return self._chats.values() + + def keys(self): + """Get all chat IDs.""" + return self._chats.keys() + + def to_dict(self) -> Dict[str, Any]: + """ + Convert the collection to a dictionary. + + Returns: + Dict[str, Any]: Dictionary representation of all chats + """ + return {chat_id: chat.to_json() for chat_id, chat in self._chats.items()} + + +class ChatStore: + """ + Stores chat information and messages. + """ + def __init__(self, type: str, name: Optional[str] = None, media: Optional[str] = None) -> None: + """ + Initialize ChatStore object. + + Args: + type (str): Device type (IOS or ANDROID) + name (Optional[str]): Chat name + media (Optional[str]): Path to media folder + + Raises: + TypeError: If name is not a string or None + """ if name is not None and not isinstance(name, str): raise TypeError("Name must be a string or None") self.name = name - self.messages = {} + self._messages: Dict[str, 'Message'] = {} self.type = type if media is not None: from Whatsapp_Chat_Exporter.utility import Device @@ -36,17 +183,27 @@ class ChatStore(): self.status = None self.media_base = "" - def add_message(self, id, message): + def __len__(self) -> int: + """Get number of chats. Required for dict-like access.""" + return len(self._messages) + + def add_message(self, id: str, message: 'Message') -> None: + """Add a message to the chat store.""" if not isinstance(message, Message): raise TypeError("message must be a Message object") - self.messages[id] = message + self._messages[id] = message + + def get_message(self, id: str) -> 'Message': + """Get a message from the chat store.""" + return self._messages.get(id) - def delete_message(self, id): - if id in self.messages: - del self.messages[id] + def delete_message(self, id: str) -> None: + """Delete a message from the chat store.""" + if id in self._messages: + del self._messages[id] - def to_json(self): - serialized_msgs = {id: msg.to_json() for id, msg in self.messages.items()} + def to_json(self) -> Dict[str, Any]: + """Convert chat store to JSON-serializable dict.""" return { 'name': self.name, 'type': self.type, @@ -54,26 +211,69 @@ class ChatStore(): 'their_avatar': self.their_avatar, 'their_avatar_thumb': self.their_avatar_thumb, 'status': self.status, - 'messages': serialized_msgs + 'messages': {id: msg.to_json() for id, msg in self._messages.items()} } - def get_last_message(self): - return tuple(self.messages.values())[-1] + def get_last_message(self) -> 'Message': + """Get the most recent message in the chat.""" + return tuple(self._messages.values())[-1] + + def items(self): + """Get message items pairs.""" + return self._messages.items() - def get_messages(self): - return self.messages.values() + def values(self): + """Get all messages in the chat.""" + return self._messages.values() + + def keys(self): + """Get all message keys in the chat.""" + return self._messages.keys() -class Message(): - def __init__(self, from_me: Union[bool,int], timestamp: int, time: Union[int,float,str], key_id: int, timezone_offset: int = 0, message_type: int = None): +class Message: + """ + Represents a single message in a chat. + """ + def __init__( + self, + *, + from_me: Union[bool, int], + timestamp: int, + time: Union[int, float, str], + key_id: int, + received_timestamp: int, + read_timestamp: int, + timezone_offset: int = 0, + message_type: Optional[int] = None + ) -> None: + """ + Initialize Message object. + + Args: + from_me (Union[bool, int]): Whether message was sent by the user + timestamp (int): Message timestamp + time (Union[int, float, str]): Message time + key_id (int): Message unique identifier + received_timestamp (int): When message was received + read_timestamp (int): When message was read + timezone_offset (int, optional): Hours offset from UTC. Defaults to 0 + message_type (Optional[int], optional): Type of message. Defaults to None + + Raises: + TypeError: If time is not a string or number + """ self.from_me = bool(from_me) self.timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp - if isinstance(time, int) or isinstance(time, float): - self.time = datetime.fromtimestamp(self.timestamp, TimeZone(timezone_offset)).strftime("%H:%M") + timing = Timing(timezone_offset) + + if isinstance(time, (int, float)): + self.time = timing.format_timestamp(self.timestamp, "%H:%M") elif isinstance(time, str): self.time = time else: raise TypeError("Time must be a string or number") + self.media = False self.key_id = key_id self.meta = False @@ -81,29 +281,33 @@ class Message(): self.sender = None self.safe = False self.mime = None - self.message_type = message_type - # Extra + self.message_type = message_type, + self.received_timestamp = timing.format_timestamp(received_timestamp, "%Y/%m/%d %H:%M") + self.read_timestamp = timing.format_timestamp(read_timestamp, "%Y/%m/%d %H:%M") + + # Extra attributes self.reply = None self.quoted_data = None self.caption = None - self.thumb = None # Android specific + self.thumb = None # Android specific self.sticker = False - - def to_json(self): + + def to_json(self) -> Dict[str, Any]: + """Convert message to JSON-serializable dict.""" return { - 'from_me' : self.from_me, - 'timestamp' : self.timestamp, - 'time' : self.time, - 'media' : self.media, - 'key_id' : self.key_id, - 'meta' : self.meta, - 'data' : self.data, - 'sender' : self.sender, - 'safe' : self.safe, - 'mime' : self.mime, - 'reply' : self.reply, - 'quoted_data' : self.quoted_data, - 'caption' : self.caption, - 'thumb' : self.thumb, - 'sticker' : self.sticker - } + 'from_me': self.from_me, + 'timestamp': self.timestamp, + 'time': self.time, + 'media': self.media, + 'key_id': self.key_id, + 'meta': self.meta, + 'data': self.data, + 'sender': self.sender, + 'safe': self.safe, + 'mime': self.mime, + 'reply': self.reply, + 'quoted_data': self.quoted_data, + 'caption': self.caption, + 'thumb': self.thumb, + 'sticker': self.sticker + } \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/exported_handler.py b/Whatsapp_Chat_Exporter/exported_handler.py index 0aae498..7215f6f 100644 --- a/Whatsapp_Chat_Exporter/exported_handler.py +++ b/Whatsapp_Chat_Exporter/exported_handler.py @@ -8,85 +8,174 @@ from Whatsapp_Chat_Exporter.utility import Device def messages(path, data, assume_first_as_me=False): - """Extracts messages from the exported file""" + """ + Extracts messages from an exported WhatsApp chat file. + + Args: + path: Path to the exported chat file + data: Data container object to store the parsed chat + assume_first_as_me: If True, assumes the first message is sent from the user without asking + + Returns: + Updated data container with extracted messages + """ + # Create a new chat in the data container + chat = data.add_chat("ExportedChat", ChatStore(Device.EXPORTED)) + you = "" # Will store the username of the current user + user_identification_done = False # Flag to track if user identification has been done + + # First pass: count total lines for progress reporting + with open(path, "r", encoding="utf8") as file: + total_row_number = sum(1 for _ in file) + + # Second pass: process the messages with open(path, "r", encoding="utf8") as file: - you = "" - data["ExportedChat"] = ChatStore(Device.EXPORTED) - chat = data["ExportedChat"] - total_row_number = len(file.readlines()) - file.seek(0) for index, line in enumerate(file): - if len(line.split(" - ")) > 1: - time = line.split(" - ")[0] - if ":" not in line.split(time)[1]: - msg.data = line.split(time)[1][3:] - msg.meta = True - else: - name = line.split(time)[1].split(":")[0] - message = line.split(time)[1].split(name + ":")[1].strip() - name = name[3:] - if you == "": - if chat.name is None: - if not assume_first_as_me: - while True: - ans = input(f"Is '{name}' you? (Y/N)").lower() - if ans == "y": - you = name - break - elif ans == "n": - chat.name = name - break - else: - you = name - else: - if name != chat.name: - you = name - elif chat.name is None: - if name != you: - chat.name = name - msg = Message( - you == name, - datetime.strptime(time, "%d/%m/%Y, %H:%M").timestamp(), - time.split(", ")[1].strip(), - index - ) - if "" in message: - msg.data = "The media is omitted in the chat" - msg.mime = "media" - msg.meta = True - elif "(file attached)" in message: - mime = MimeTypes() - msg.media = True - file_path = os.path.join(os.path.dirname(path), message.split("(file attached)")[0].strip()) - if os.path.isfile(file_path): - msg.data = file_path - guess = mime.guess_type(file_path)[0] - if guess is not None: - msg.mime = guess - else: - msg.mime = "application/octet-stream" - else: - msg.data = "The media is missing" - msg.mime = "media" - msg.meta = True - else: - msg.data = message - if "\r\n" in message: - msg.data = message.replace("\r\n", "
") - if "\n" in message: - msg.data = message.replace("\n", "
") - chat.add_message(index, msg) - else: - lookback = index - 1 - while lookback not in chat.messages: - lookback -= 1 - msg = chat.messages[lookback] - if msg.media: - msg.caption = line.strip() - else: - msg.data += "
" + line.strip() - + you, user_identification_done = process_line( + line, index, chat, path, you, + assume_first_as_me, user_identification_done + ) + + # Show progress if index % 1000 == 0: print(f"Processing messages & media...({index}/{total_row_number})", end="\r") - print(f"Processing messages & media...({total_row_number}/{total_row_number})", end="\r") + + print(f"Processing messages & media...({total_row_number}/{total_row_number})") return data + + +def process_line(line, index, chat, file_path, you, assume_first_as_me, user_identification_done): + """ + Process a single line from the chat file + + Returns: + Tuple of (updated_you_value, updated_user_identification_done_flag) + """ + parts = line.split(" - ", 1) + + # Check if this is a new message (has timestamp format) + if len(parts) > 1: + time = parts[0] + you, user_identification_done = process_new_message( + time, parts[1], index, chat, you, file_path, + assume_first_as_me, user_identification_done + ) + else: + # This is a continuation of the previous message + process_message_continuation(line, index, chat) + + return you, user_identification_done + + +def process_new_message(time, content, index, chat, you, file_path, + assume_first_as_me, user_identification_done): + """ + Process a line that contains a new message + + Returns: + Tuple of (updated_you_value, updated_user_identification_done_flag) + """ + # Create a new message + msg = Message( + from_me=False, # Will be updated later if needed + timestamp=datetime.strptime(time, "%d/%m/%Y, %H:%M").timestamp(), + time=time.split(", ")[1].strip(), + key_id=index, + received_timestamp=None, + read_timestamp=None + ) + + # Check if this is a system message (no name:message format) + if ":" not in content: + msg.data = content + msg.meta = True + else: + # Process user message + name, message = content.strip().split(":", 1) + + # Handle user identification + if you == "": + if chat.name is None: + # First sender identification + if not user_identification_done: + if not assume_first_as_me: + # Ask only once if this is the user + you = prompt_for_user_identification(name) + user_identification_done = True + else: + you = name + user_identification_done = True + else: + # If we know the chat name, anyone else must be "you" + if name != chat.name: + you = name + + # Set the chat name if needed + if chat.name is None and name != you: + chat.name = name + + # Determine if this message is from the current user + msg.from_me = (name == you) + + # Process message content + process_message_content(msg, message, file_path) + + chat.add_message(index, msg) + return you, user_identification_done + + +def process_message_content(msg, message, file_path): + """Process and set the content of a message based on its type""" + if "" in message: + msg.data = "The media is omitted in the chat" + msg.mime = "media" + msg.meta = True + elif "(file attached)" in message: + process_attached_file(msg, message, file_path) + else: + msg.data = message.replace("\r\n", "
").replace("\n", "
") + + +def process_attached_file(msg, message, file_path): + """Process an attached file in a message""" + mime = MimeTypes() + msg.media = True + + # Extract file path and check if it exists + file_name = message.split("(file attached)")[0].strip() + attached_file_path = os.path.join(os.path.dirname(file_path), file_name) + + if os.path.isfile(attached_file_path): + msg.data = attached_file_path + guess = mime.guess_type(attached_file_path)[0] + msg.mime = guess if guess is not None else "application/octet-stream" + else: + msg.data = "The media is missing" + msg.mime = "media" + msg.meta = True + + +def process_message_continuation(line, index, chat): + """Process a line that continues a previous message""" + # Find the previous message + lookback = index - 1 + while lookback not in chat.keys(): + lookback -= 1 + + msg = chat.get_message(lookback) + + # Add the continuation line to the message + if msg.media: + msg.caption = line.strip() + else: + msg.data += "
" + line.strip() + + +def prompt_for_user_identification(name): + """Ask the user if the given name is their username""" + while True: + ans = input(f"Is '{name}' you? (Y/N)").lower() + if ans == "y": + return name + elif ans == "n": + return "" \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 46a8f2b..7a15835 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -12,432 +12,591 @@ from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, def contacts(db, data): + """Process WhatsApp contacts with status information.""" c = db.cursor() - # Get status only lol c.execute("""SELECT count() FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") total_row_number = c.fetchone()[0] print(f"Pre-processing contacts...({total_row_number})") + c.execute("""SELECT ZWHATSAPPID, ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") content = c.fetchone() while content is not None: - if not content["ZWHATSAPPID"].endswith("@s.whatsapp.net"): - ZWHATSAPPID = content["ZWHATSAPPID"] + "@s.whatsapp.net" - data[ZWHATSAPPID] = ChatStore(Device.IOS) - data[ZWHATSAPPID].status = content["ZABOUTTEXT"] + zwhatsapp_id = content["ZWHATSAPPID"] + if not zwhatsapp_id.endswith("@s.whatsapp.net"): + zwhatsapp_id += "@s.whatsapp.net" + + current_chat = ChatStore(Device.IOS) + current_chat.status = content["ZABOUTTEXT"] + data.add_chat(zwhatsapp_id, current_chat) content = c.fetchone() +def process_contact_avatars(current_chat, media_folder, contact_id): + """Process and assign avatar images for a contact.""" + path = f'{media_folder}/Media/Profile/{contact_id.split("@")[0]}' + avatars = glob(f"{path}*") + + if 0 < len(avatars) <= 1: + current_chat.their_avatar = avatars[0] + else: + for avatar in avatars: + if avatar.endswith(".thumb") and current_chat.their_avatar_thumb is None: + current_chat.their_avatar_thumb = avatar + elif avatar.endswith(".jpg") and current_chat.their_avatar is None: + current_chat.their_avatar = avatar + + +def get_contact_name(content): + """Determine the appropriate contact name based on push name and partner name.""" + is_phone = content["ZPARTNERNAME"].replace("+", "").replace(" ", "").isdigit() + if content["ZPUSHNAME"] is None or (content["ZPUSHNAME"] and not is_phone): + return content["ZPARTNERNAME"] + else: + return content["ZPUSHNAME"] + + def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty): + """Process WhatsApp messages and contacts from the database.""" c = db.cursor() cursor2 = db.cursor() - # Get contacts - c.execute( - f"""SELECT count() - FROM (SELECT DISTINCT ZCONTACTJID, - ZPARTNERNAME, - ZWAPROFILEPUSHNAME.ZPUSHNAME - FROM ZWACHATSESSION - INNER JOIN ZWAMESSAGE - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - LEFT JOIN ZWAPROFILEPUSHNAME - ON ZWACHATSESSION.ZCONTACTJID = ZWAPROFILEPUSHNAME.ZJID - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - WHERE 1=1 - {get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - GROUP BY ZCONTACTJID);""" - ) + + # Build the chat filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + date_filter = f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else '' + + # Process contacts first + contact_query = f""" + SELECT count() + FROM (SELECT DISTINCT ZCONTACTJID, + ZPARTNERNAME, + ZWAPROFILEPUSHNAME.ZPUSHNAME + FROM ZWACHATSESSION + INNER JOIN ZWAMESSAGE + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + LEFT JOIN ZWAPROFILEPUSHNAME + ON ZWACHATSESSION.ZCONTACTJID = ZWAPROFILEPUSHNAME.ZJID + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + WHERE 1=1 + {chat_filter_include} + {chat_filter_exclude} + GROUP BY ZCONTACTJID); + """ + c.execute(contact_query) total_row_number = c.fetchone()[0] print(f"Processing contacts...({total_row_number})") - c.execute( - f"""SELECT DISTINCT ZCONTACTJID, - ZPARTNERNAME, - ZWAPROFILEPUSHNAME.ZPUSHNAME - FROM ZWACHATSESSION - INNER JOIN ZWAMESSAGE - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - LEFT JOIN ZWAPROFILEPUSHNAME - ON ZWACHATSESSION.ZCONTACTJID = ZWAPROFILEPUSHNAME.ZJID - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - WHERE 1=1 - {get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - GROUP BY ZCONTACTJID;""" - ) + # Get distinct contacts + contacts_query = f""" + SELECT DISTINCT ZCONTACTJID, + ZPARTNERNAME, + ZWAPROFILEPUSHNAME.ZPUSHNAME + FROM ZWACHATSESSION + INNER JOIN ZWAMESSAGE + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + LEFT JOIN ZWAPROFILEPUSHNAME + ON ZWACHATSESSION.ZCONTACTJID = ZWAPROFILEPUSHNAME.ZJID + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + WHERE 1=1 + {chat_filter_include} + {chat_filter_exclude} + GROUP BY ZCONTACTJID; + """ + c.execute(contacts_query) + + # Process each contact content = c.fetchone() while content is not None: - is_phone = content["ZPARTNERNAME"].replace("+", "").replace(" ", "").isdigit() - if content["ZPUSHNAME"] is None or (content["ZPUSHNAME"] and not is_phone): - contact_name = content["ZPARTNERNAME"] - else: - contact_name = content["ZPUSHNAME"] + contact_name = get_contact_name(content) contact_id = content["ZCONTACTJID"] + + # Add or update chat if contact_id not in data: - data[contact_id] = ChatStore(Device.IOS, contact_name, media_folder) + current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder)) else: - data[contact_id].name = contact_name - data[contact_id].my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg") - path = f'{media_folder}/Media/Profile/{contact_id.split("@")[0]}' - avatars = glob(f"{path}*") - if 0 < len(avatars) <= 1: - data[contact_id].their_avatar = avatars[0] - else: - for avatar in avatars: - if avatar.endswith(".thumb") and data[content["ZCONTACTJID"]].their_avatar_thumb is None: - data[contact_id].their_avatar_thumb = avatar - elif avatar.endswith(".jpg") and data[content["ZCONTACTJID"]].their_avatar is None: - data[contact_id].their_avatar = avatar + current_chat = data.get_chat(contact_id) + current_chat.name = contact_name + current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg") + + # Process avatar images + process_contact_avatars(current_chat, media_folder, contact_id) content = c.fetchone() - # Get message history - c.execute(f"""SELECT count() - FROM ZWAMESSAGE - INNER JOIN ZWACHATSESSION - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - WHERE 1=1 - {f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")}""") + # Get message count + message_count_query = f""" + SELECT count() + FROM ZWAMESSAGE + INNER JOIN ZWACHATSESSION + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + WHERE 1=1 + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + """ + c.execute(message_count_query) total_row_number = c.fetchone()[0] print(f"Processing messages...(0/{total_row_number})", end="\r") - c.execute(f"""SELECT ZCONTACTJID, - ZWAMESSAGE.Z_PK, - ZISFROMME, - ZMESSAGEDATE, - ZTEXT, - ZMESSAGETYPE, - ZWAGROUPMEMBER.ZMEMBERJID, - ZMETADATA, - ZSTANZAID, - ZGROUPINFO - FROM ZWAMESSAGE - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - LEFT JOIN ZWAMEDIAITEM - ON ZWAMESSAGE.Z_PK = ZWAMEDIAITEM.ZMESSAGE - INNER JOIN ZWACHATSESSION - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - WHERE 1=1 - {f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - ORDER BY ZMESSAGEDATE ASC;""") + + # Fetch messages + messages_query = f""" + SELECT ZCONTACTJID, + ZWAMESSAGE.Z_PK, + ZISFROMME, + ZMESSAGEDATE, + ZTEXT, + ZMESSAGETYPE, + ZWAGROUPMEMBER.ZMEMBERJID, + ZMETADATA, + ZSTANZAID, + ZGROUPINFO, + ZSENTDATE + FROM ZWAMESSAGE + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + LEFT JOIN ZWAMEDIAITEM + ON ZWAMESSAGE.Z_PK = ZWAMEDIAITEM.ZMESSAGE + INNER JOIN ZWACHATSESSION + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + WHERE 1=1 + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + ORDER BY ZMESSAGEDATE ASC; + """ + c.execute(messages_query) + + # Process each message i = 0 content = c.fetchone() while content is not None: - ZCONTACTJID = content["ZCONTACTJID"] - Z_PK = content["Z_PK"] + contact_id = content["ZCONTACTJID"] + message_pk = content["Z_PK"] is_group_message = content["ZGROUPINFO"] is not None - if ZCONTACTJID not in data: - data[ZCONTACTJID] = ChatStore(Device.IOS) - path = f'{media_folder}/Media/Profile/{ZCONTACTJID.split("@")[0]}' - avatars = glob(f"{path}*") - if 0 < len(avatars) <= 1: - data[ZCONTACTJID].their_avatar = avatars[0] - else: - for avatar in avatars: - if avatar.endswith(".thumb"): - data[ZCONTACTJID].their_avatar_thumb = avatar - elif avatar.endswith(".jpg"): - data[ZCONTACTJID].their_avatar = avatar + + # Ensure chat exists + if contact_id not in data: + current_chat = data.add_chat(contact_id, ChatStore(Device.IOS)) + process_contact_avatars(current_chat, media_folder, contact_id) + else: + current_chat = data.get_chat(contact_id) + + # Create message object ts = APPLE_TIME + content["ZMESSAGEDATE"] message = Message( from_me=content["ZISFROMME"], timestamp=ts, - time=ts, # TODO: Could be bug + time=ts, key_id=content["ZSTANZAID"][:17], timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, - message_type=content["ZMESSAGETYPE"] + message_type=content["ZMESSAGETYPE"], + received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None, + read_timestamp=None # TODO: Add timestamp ) - invalid = False - if is_group_message and content["ZISFROMME"] == 0: - name = None - if content["ZMEMBERJID"] is not None: - if content["ZMEMBERJID"] in data: - name = data[content["ZMEMBERJID"]].name - if "@" in content["ZMEMBERJID"]: - fallback = content["ZMEMBERJID"].split('@')[0] - else: - fallback = None - else: - fallback = None - message.sender = name or fallback - else: - message.sender = None - if content["ZMESSAGETYPE"] == 6: - # Metadata - if is_group_message: - # Group - if content["ZTEXT"] is not None: - # Chnaged name - try: - int(content["ZTEXT"]) - except ValueError: - msg = f"The group name changed to {content['ZTEXT']}" - message.data = msg - message.meta = True - else: - invalid = True - else: - message.data = None - else: - message.data = None - else: - # real message - if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14"): - quoted = content["ZMETADATA"][2:19] - message.reply = quoted.decode() - cursor2.execute(f"""SELECT ZTEXT - FROM ZWAMESSAGE - WHERE ZSTANZAID LIKE '{message.reply}%'""") - quoted_content = cursor2.fetchone() - if quoted_content and "ZTEXT" in quoted_content: - message.quoted_data = quoted_content["ZTEXT"] - else: - message.quoted_data = None - if content["ZMESSAGETYPE"] == 15: # Sticker - message.sticker = True - - if content["ZISFROMME"] == 1: - if content["ZMESSAGETYPE"] == 14: - msg = "Message deleted" - message.meta = True - else: - msg = content["ZTEXT"] - if msg is not None: - if "\r\n" in msg: - msg = msg.replace("\r\n", "
") - if "\n" in msg: - msg = msg.replace("\n", "
") - else: - if content["ZMESSAGETYPE"] == 14: - msg = "Message deleted" - message.meta = True - else: - msg = content["ZTEXT"] - if msg is not None: - if "\r\n" in msg: - msg = msg.replace("\r\n", "
") - if "\n" in msg: - msg = msg.replace("\n", "
") - message.data = msg + + # Process message data + invalid = process_message_data(message, content, is_group_message, data, cursor2) + + # Add valid messages to chat if not invalid: - data[ZCONTACTJID].add_message(Z_PK, message) + current_chat.add_message(message_pk, message) + + # Update progress i += 1 if i % 1000 == 0: print(f"Processing messages...({i}/{total_row_number})", end="\r") content = c.fetchone() - print( - f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + + print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + + +def process_message_data(message, content, is_group_message, data, cursor2): + """Process and set message data from content row.""" + # Handle group sender info + if is_group_message and content["ZISFROMME"] == 0: + name = None + if content["ZMEMBERJID"] is not None: + if content["ZMEMBERJID"] in data: + name = data.get_chat(content["ZMEMBERJID"]).name + if "@" in content["ZMEMBERJID"]: + fallback = content["ZMEMBERJID"].split('@')[0] + else: + fallback = None + else: + fallback = None + message.sender = name or fallback + else: + message.sender = None + + # Handle metadata messages + if content["ZMESSAGETYPE"] == 6: + return process_metadata_message(message, content, is_group_message) + + # Handle quoted replies + if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14") and False: + quoted = content["ZMETADATA"][2:19] + message.reply = quoted.decode() + cursor2.execute(f"""SELECT ZTEXT + FROM ZWAMESSAGE + WHERE ZSTANZAID LIKE '{message.reply}%'""") + quoted_content = cursor2.fetchone() + if quoted_content and "ZTEXT" in quoted_content: + message.quoted_data = quoted_content["ZTEXT"] + else: + message.quoted_data = None + + # Handle stickers + if content["ZMESSAGETYPE"] == 15: + message.sticker = True + + # Process message text + process_message_text(message, content) + + return False # Message is valid + + +def process_metadata_message(message, content, is_group_message): + """Process metadata messages (action_type 6).""" + if is_group_message: + # Group + if content["ZTEXT"] is not None: + # Changed name + try: + int(content["ZTEXT"]) + except ValueError: + msg = f"The group name changed to {content['ZTEXT']}" + message.data = msg + message.meta = True + return False # Valid message + else: + return True # Invalid message + else: + message.data = None + return False + else: + message.data = None + return False + + +def process_message_text(message, content): + """Process and format message text content.""" + if content["ZISFROMME"] == 1: + if content["ZMESSAGETYPE"] == 14: + msg = "Message deleted" + message.meta = True + else: + msg = content["ZTEXT"] + if msg is not None: + msg = msg.replace("\r\n", "
").replace("\n", "
") + else: + if content["ZMESSAGETYPE"] == 14: + msg = "Message deleted" + message.meta = True + else: + msg = content["ZTEXT"] + if msg is not None: + msg = msg.replace("\r\n", "
").replace("\n", "
") + + message.data = msg def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False): + """Process media files from WhatsApp messages.""" c = db.cursor() - # Get media - c.execute(f"""SELECT count() - FROM ZWAMEDIAITEM - INNER JOIN ZWAMESSAGE - ON ZWAMEDIAITEM.ZMESSAGE = ZWAMESSAGE.Z_PK - INNER JOIN ZWACHATSESSION - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - WHERE 1=1 - {f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID","ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - """) + + # Build filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID","ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + date_filter = f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else '' + + # Get media count + media_count_query = f""" + SELECT count() + FROM ZWAMEDIAITEM + INNER JOIN ZWAMESSAGE + ON ZWAMEDIAITEM.ZMESSAGE = ZWAMESSAGE.Z_PK + INNER JOIN ZWACHATSESSION + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + WHERE 1=1 + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + """ + c.execute(media_count_query) total_row_number = c.fetchone()[0] print(f"\nProcessing media...(0/{total_row_number})", end="\r") - i = 0 - c.execute(f"""SELECT ZCONTACTJID, - ZMESSAGE, - ZMEDIALOCALPATH, - ZMEDIAURL, - ZVCARDSTRING, - ZMEDIAKEY, - ZTITLE - FROM ZWAMEDIAITEM - INNER JOIN ZWAMESSAGE - ON ZWAMEDIAITEM.ZMESSAGE = ZWAMESSAGE.Z_PK - INNER JOIN ZWACHATSESSION - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - WHERE ZMEDIALOCALPATH IS NOT NULL - {f'AND ZWAMESSAGE.ZMESSAGEDATE {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - ORDER BY ZCONTACTJID ASC""") - content = c.fetchone() + + # Fetch media items + media_query = f""" + SELECT ZCONTACTJID, + ZMESSAGE, + ZMEDIALOCALPATH, + ZMEDIAURL, + ZVCARDSTRING, + ZMEDIAKEY, + ZTITLE + FROM ZWAMEDIAITEM + INNER JOIN ZWAMESSAGE + ON ZWAMEDIAITEM.ZMESSAGE = ZWAMESSAGE.Z_PK + INNER JOIN ZWACHATSESSION + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + WHERE ZMEDIALOCALPATH IS NOT NULL + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + ORDER BY ZCONTACTJID ASC + """ + c.execute(media_query) + + # Process each media item mime = MimeTypes() + i = 0 + content = c.fetchone() while content is not None: - file_path = f"{media_folder}/Message/{content['ZMEDIALOCALPATH']}" - ZMESSAGE = content["ZMESSAGE"] - contact = data[content["ZCONTACTJID"]] - message = contact.messages[ZMESSAGE] - message.media = True - if contact.media_base == "": - contact.media_base = media_folder + "/" - if os.path.isfile(file_path): - message.data = '/'.join(file_path.split("/")[1:]) - if content["ZVCARDSTRING"] is None: - guess = mime.guess_type(file_path)[0] - if guess is not None: - message.mime = guess - else: - message.mime = "application/octet-stream" - else: - message.mime = content["ZVCARDSTRING"] - if separate_media: - chat_display_name = slugify(contact.name or message.sender \ - or content["ZCONTACTJID"].split('@')[0], True) - current_filename = file_path.split("/")[-1] - new_folder = os.path.join(media_folder, "separated", chat_display_name) - Path(new_folder).mkdir(parents=True, exist_ok=True) - new_path = os.path.join(new_folder, current_filename) - shutil.copy2(file_path, new_path) - message.data = '/'.join(new_path.split("\\")[1:]) - else: - message.data = "The media is missing" - message.mime = "media" - message.meta = True - if content["ZTITLE"] is not None: - message.caption = content["ZTITLE"] + process_media_item(content, data, media_folder, mime, separate_media) + + # Update progress i += 1 if i % 100 == 0: print(f"Processing media...({i}/{total_row_number})", end="\r") content = c.fetchone() - print( - f"Processing media...({total_row_number}/{total_row_number})", end="\r") + + print(f"Processing media...({total_row_number}/{total_row_number})", end="\r") + + +def process_media_item(content, data, media_folder, mime, separate_media): + """Process a single media item.""" + file_path = f"{media_folder}/Message/{content['ZMEDIALOCALPATH']}" + current_chat = data.get_chat(content["ZCONTACTJID"]) + message = current_chat.get_message(content["ZMESSAGE"]) + message.media = True + + if current_chat.media_base == "": + current_chat.media_base = media_folder + "/" + + if os.path.isfile(file_path): + message.data = '/'.join(file_path.split("/")[1:]) + + # Set MIME type + if content["ZVCARDSTRING"] is None: + guess = mime.guess_type(file_path)[0] + message.mime = guess if guess is not None else "application/octet-stream" + else: + message.mime = content["ZVCARDSTRING"] + + # Handle separate media option + if separate_media: + chat_display_name = slugify(current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True) + current_filename = file_path.split("/")[-1] + new_folder = os.path.join(media_folder, "separated", chat_display_name) + Path(new_folder).mkdir(parents=True, exist_ok=True) + new_path = os.path.join(new_folder, current_filename) + shutil.copy2(file_path, new_path) + message.data = '/'.join(new_path.split("\\")[1:]) + else: + # Handle missing media + message.data = "The media is missing" + message.mime = "media" + message.meta = True + + # Add caption if available + if content["ZTITLE"] is not None: + message.caption = content["ZTITLE"] def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): + """Process vCard contacts from WhatsApp messages.""" c = db.cursor() - c.execute(f"""SELECT DISTINCT ZWAVCARDMENTION.ZMEDIAITEM, - ZWAMEDIAITEM.ZMESSAGE, - ZCONTACTJID, - ZVCARDNAME, - ZVCARDSTRING - FROM ZWAVCARDMENTION - INNER JOIN ZWAMEDIAITEM - ON ZWAVCARDMENTION.ZMEDIAITEM = ZWAMEDIAITEM.Z_PK - INNER JOIN ZWAMESSAGE - ON ZWAMEDIAITEM.ZMESSAGE = ZWAMESSAGE.Z_PK - INNER JOIN ZWACHATSESSION - ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK - LEFT JOIN ZWAGROUPMEMBER - ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK - WHERE 1=1 - {f'AND ZWAMESSAGE.ZMESSAGEDATE {filter_date}' if filter_date is not None else ''} - {get_chat_condition(filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")} - {get_chat_condition(filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios")};""") + + # Build filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + date_filter = f'AND ZWAMESSAGE.ZMESSAGEDATE {filter_date}' if filter_date is not None else '' + + # Fetch vCard mentions + vcard_query = f""" + SELECT DISTINCT ZWAVCARDMENTION.ZMEDIAITEM, + ZWAMEDIAITEM.ZMESSAGE, + ZCONTACTJID, + ZVCARDNAME, + ZVCARDSTRING + FROM ZWAVCARDMENTION + INNER JOIN ZWAMEDIAITEM + ON ZWAVCARDMENTION.ZMEDIAITEM = ZWAMEDIAITEM.Z_PK + INNER JOIN ZWAMESSAGE + ON ZWAMEDIAITEM.ZMESSAGE = ZWAMESSAGE.Z_PK + INNER JOIN ZWACHATSESSION + ON ZWAMESSAGE.ZCHATSESSION = ZWACHATSESSION.Z_PK + LEFT JOIN ZWAGROUPMEMBER + ON ZWAMESSAGE.ZGROUPMEMBER = ZWAGROUPMEMBER.Z_PK + WHERE 1=1 + {date_filter} + {chat_filter_include} + {chat_filter_exclude} + """ + c.execute(vcard_query) contents = c.fetchall() total_row_number = len(contents) print(f"\nProcessing vCards...(0/{total_row_number})", end="\r") + + # Create vCards directory path = f'{media_folder}/Message/vCards' Path(path).mkdir(parents=True, exist_ok=True) + # Process each vCard for index, content in enumerate(contents): - file_paths = [] - vcard_names = content["ZVCARDNAME"].split("_$!!$_") - vcard_strings = content["ZVCARDSTRING"].split("_$!!$_") - - # If this is a list of contacts - if len(vcard_names) > len(vcard_strings): - vcard_names.pop(0) # Dismiss the first element, which is the group name - - for name, vcard_string in zip(vcard_names, vcard_strings): - file_name = "".join(x for x in name if x.isalnum()) - file_name = file_name.encode('utf-8')[:230].decode('utf-8', 'ignore') - file_path = os.path.join(path, f"{file_name}.vcf") - file_paths.append(file_path) - - if not os.path.isfile(file_path): - with open(file_path, "w", encoding="utf-8") as f: - f.write(vcard_string) - - vcard_summary = "This media include the following vCard file(s):
" - vcard_summary += " | ".join([f'{htmle(name)}' for name, fp in zip(vcard_names, file_paths)]) - message = data[content["ZCONTACTJID"]].messages[content["ZMESSAGE"]] - message.data = vcard_summary - message.mime = "text/x-vcard" - message.media = True - message.meta = True - message.safe = True + process_vcard_item(content, path, data) print(f"Processing vCards...({index + 1}/{total_row_number})", end="\r") +def process_vcard_item(content, path, data): + """Process a single vCard item.""" + file_paths = [] + vcard_names = content["ZVCARDNAME"].split("_$!!$_") + vcard_strings = content["ZVCARDSTRING"].split("_$!!$_") + + # If this is a list of contacts + if len(vcard_names) > len(vcard_strings): + vcard_names.pop(0) # Dismiss the first element, which is the group name + + # Save each vCard file + for name, vcard_string in zip(vcard_names, vcard_strings): + file_name = "".join(x for x in name if x.isalnum()) + file_name = file_name.encode('utf-8')[:230].decode('utf-8', 'ignore') + file_path = os.path.join(path, f"{file_name}.vcf") + file_paths.append(file_path) + + if not os.path.isfile(file_path): + with open(file_path, "w", encoding="utf-8") as f: + f.write(vcard_string) + + # Create vCard summary and update message + vcard_summary = "This media include the following vCard file(s):
" + vcard_summary += " | ".join([f'{htmle(name)}' for name, fp in zip(vcard_names, file_paths)]) + + message = data.get_chat(content["ZCONTACTJID"]).get_message(content["ZMESSAGE"]) + message.data = vcard_summary + message.mime = "text/x-vcard" + message.media = True + message.meta = True + message.safe = True + + def calls(db, data, timezone_offset, filter_chat): + """Process WhatsApp call records.""" c = db.cursor() - c.execute(f"""SELECT count() - FROM ZWACDCALLEVENT - WHERE 1=1 - {get_chat_condition(filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios")} - {get_chat_condition(filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios")}""") + + # Build filter conditions + chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") + chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") + + # Get call count + call_count_query = f""" + SELECT count() + FROM ZWACDCALLEVENT + WHERE 1=1 + {chat_filter_include} + {chat_filter_exclude} + """ + c.execute(call_count_query) total_row_number = c.fetchone()[0] if total_row_number == 0: return + print(f"\nProcessing calls...({total_row_number})", end="\r") - c.execute(f"""SELECT ZCALLIDSTRING, - ZGROUPCALLCREATORUSERJIDSTRING, - ZGROUPJIDSTRING, - ZDATE, - ZOUTCOME, - ZBYTESRECEIVED + ZBYTESSENT AS bytes_transferred, - ZDURATION, - ZVIDEO, - ZMISSED, - ZINCOMING - FROM ZWACDCALLEVENT - INNER JOIN ZWAAGGREGATECALLEVENT - ON ZWACDCALLEVENT.Z1CALLEVENTS = ZWAAGGREGATECALLEVENT.Z_PK - WHERE 1=1 - {get_chat_condition(filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios")} - {get_chat_condition(filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios")}""") + + # Fetch call records + calls_query = f""" + SELECT ZCALLIDSTRING, + ZGROUPCALLCREATORUSERJIDSTRING, + ZGROUPJIDSTRING, + ZDATE, + ZOUTCOME, + ZBYTESRECEIVED + ZBYTESSENT AS bytes_transferred, + ZDURATION, + ZVIDEO, + ZMISSED, + ZINCOMING + FROM ZWACDCALLEVENT + INNER JOIN ZWAAGGREGATECALLEVENT + ON ZWACDCALLEVENT.Z1CALLEVENTS = ZWAAGGREGATECALLEVENT.Z_PK + WHERE 1=1 + {chat_filter_include} + {chat_filter_exclude} + """ + c.execute(calls_query) + + # Create calls chat chat = ChatStore(Device.ANDROID, "WhatsApp Calls") + + # Process each call content = c.fetchone() while content is not None: - ts = APPLE_TIME + int(content["ZDATE"]) - call = Message( - from_me=content["ZINCOMING"] == 0, - timestamp=ts, - time=ts, - key_id=content["ZCALLIDSTRING"], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET - ) - _jid = content["ZGROUPCALLCREATORUSERJIDSTRING"] - name = data[_jid].name if _jid in data else None - if _jid is not None and "@" in _jid: - fallback = _jid.split('@')[0] - else: - fallback = None - call.sender = name or fallback - call.meta = True - call.data = ( - f"A {'group ' if content['ZGROUPJIDSTRING'] is not None else ''}" - f"{'video' if content['ZVIDEO'] == 1 else 'voice'} " - f"call {'to' if call.from_me else 'from'} " - f"{call.sender} was " - ) - if content['ZOUTCOME'] in (1, 4): - call.data += "not answered." if call.from_me else "missed." - elif content['ZOUTCOME'] == 2: - call.data += "failed." - elif content['ZOUTCOME'] == 0: - call_time = convert_time_unit(int(content['ZDURATION'])) - call_bytes = bytes_to_readable(content['bytes_transferred']) - call.data += ( - f"initiated and lasted for {call_time} " - f"with {call_bytes} data transferred." - ) - else: - call.data += "in an unknown state." - chat.add_message(call.key_id, call) + process_call_record(content, chat, data, timezone_offset) content = c.fetchone() - data["000000000000000"] = chat \ No newline at end of file + + # Add calls chat to data + data.add_chat("000000000000000", chat) + + +def process_call_record(content, chat, data, timezone_offset): + """Process a single call record.""" + ts = APPLE_TIME + int(content["ZDATE"]) + call = Message( + from_me=content["ZINCOMING"] == 0, + timestamp=ts, + time=ts, + key_id=content["ZCALLIDSTRING"], + timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET + ) + + # Set sender info + _jid = content["ZGROUPCALLCREATORUSERJIDSTRING"] + name = data.get_chat(_jid).name if _jid in data else None + if _jid is not None and "@" in _jid: + fallback = _jid.split('@')[0] + else: + fallback = None + call.sender = name or fallback + + # Set call metadata + call.meta = True + call.data = format_call_data(call, content) + + # Add call to chat + chat.add_message(call.key_id, call) + + +def format_call_data(call, content): + """Format call data message based on call attributes.""" + # Basic call info + call_data = ( + f"A {'group ' if content['ZGROUPJIDSTRING'] is not None else ''}" + f"{'video' if content['ZVIDEO'] == 1 else 'voice'} " + f"call {'to' if call.from_me else 'from'} " + f"{call.sender} was " + ) + + # Call outcome + if content['ZOUTCOME'] in (1, 4): + call_data += "not answered." if call.from_me else "missed." + elif content['ZOUTCOME'] == 2: + call_data += "failed." + elif content['ZOUTCOME'] == 0: + call_time = convert_time_unit(int(content['ZDURATION'])) + call_bytes = bytes_to_readable(content['bytes_transferred']) + call_data += ( + f"initiated and lasted for {call_time} " + f"with {call_bytes} data transferred." + ) + else: + call_data += "in an unknown state." + + return call_data \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index dc817b6..a1dcd30 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -4,6 +4,7 @@ import shutil import sqlite3 import os import getpass +from sys import exit from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier from Whatsapp_Chat_Exporter.bplist import BPListReader try: @@ -14,143 +15,218 @@ else: support_encrypted = True -def extract_encrypted(base_dir, password, identifiers, decrypt_chunk_size): - print("Trying to decrypt the iOS backup...", end="") - backup = EncryptedBackup( - backup_directory=base_dir, - passphrase=password, - cleanup=False, - check_same_thread=False, - decrypt_chunk_size=decrypt_chunk_size - ) - print("Done\nDecrypting WhatsApp database...", end="") - try: - backup.extract_file( - relative_path=RelativePath.WHATSAPP_MESSAGES, - domain_like=identifiers.DOMAIN, - output_filename=identifiers.MESSAGE - ) - backup.extract_file( - relative_path=RelativePath.WHATSAPP_CONTACTS, - domain_like=identifiers.DOMAIN, - output_filename=identifiers.CONTACT - ) - backup.extract_file( - relative_path=RelativePath.WHATSAPP_CALLS, - domain_like=identifiers.DOMAIN, - output_filename=identifiers.CALL - ) - except ValueError: - print("Failed to decrypt backup: incorrect password?") - exit(7) - except FileNotFoundError: - print("Essential WhatsApp files are missing from the iOS backup.") - exit(6) - else: - print("Done") +class BackupExtractor: + """ + A class to handle the extraction of WhatsApp data from iOS backups, + including encrypted and unencrypted backups. + """ - def extract_progress_handler(file_id, domain, relative_path, n, total_files): - if n % 100 == 0: - print(f"Decrypting and extracting files...({n}/{total_files})", end="\r") - return True + def __init__(self, base_dir, identifiers, decrypt_chunk_size): + self.base_dir = base_dir + self.identifiers = identifiers + self.decrypt_chunk_size = decrypt_chunk_size - backup.extract_files( - domain_like=identifiers.DOMAIN, - output_folder=identifiers.DOMAIN, - preserve_folders=True, - filter_callback=extract_progress_handler - ) - print(f"All required files are decrypted and extracted. ", end="\n") - return backup - - -def is_encrypted(base_dir): - with sqlite3.connect(os.path.join(base_dir, "Manifest.db")) as f: - c = f.cursor() - try: - c.execute("""SELECT count() - FROM Files - """) - except sqlite3.OperationalError as e: - raise e # These error cannot be used to determine if the backup is encrypted - except sqlite3.DatabaseError: - return True + def extract(self): + """ + Extracts WhatsApp data from the backup based on whether it's encrypted or not. + """ + if self._is_encrypted(): + self._extract_encrypted_backup() else: - return False + self._extract_unencrypted_backup() + def _is_encrypted(self): + """ + Checks if the iOS backup is encrypted. -def extract_media(base_dir, identifiers, decrypt_chunk_size): - if is_encrypted(base_dir): + Returns: + bool: True if encrypted, False otherwise. + """ + with sqlite3.connect(os.path.join(self.base_dir, "Manifest.db")) as db: + c = db.cursor() + try: + c.execute("SELECT count() FROM Files") + c.fetchone() # Execute and fetch to trigger potential errors + except (sqlite3.OperationalError, sqlite3.DatabaseError): + return True + else: + return False + + def _extract_encrypted_backup(self): + """ + Handles the extraction of data from an encrypted iOS backup. + """ if not support_encrypted: print("You don't have the dependencies to handle encrypted backup.") print("Read more on how to deal with encrypted backup:") print("https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage") - return False + return + print("Encryption detected on the backup!") password = getpass.getpass("Enter the password for the backup:") - extract_encrypted(base_dir, password, identifiers, decrypt_chunk_size) - else: - wts_db = os.path.join(base_dir, identifiers.MESSAGE[:2], identifiers.MESSAGE) - contact_db = os.path.join(base_dir, identifiers.CONTACT[:2], identifiers.CONTACT) - call_db = os.path.join(base_dir, identifiers.CALL[:2], identifiers.CALL) - if not os.path.isfile(wts_db): - if identifiers is WhatsAppIdentifier: + self._decrypt_backup(password) + self._extract_decrypted_files() + + def _decrypt_backup(self, password): + """ + Decrypts the iOS backup using the provided password. + + Args: + password (str): The password for the encrypted backup. + """ + print("Trying to decrypt the iOS backup...", end="") + self.backup = EncryptedBackup( + backup_directory=self.base_dir, + passphrase=password, + cleanup=False, + check_same_thread=False, + decrypt_chunk_size=self.decrypt_chunk_size, + ) + print("Done\nDecrypting WhatsApp database...", end="") + try: + self.backup.extract_file( + relative_path=RelativePath.WHATSAPP_MESSAGES, + domain_like=self.identifiers.DOMAIN, + output_filename=self.identifiers.MESSAGE, + ) + self.backup.extract_file( + relative_path=RelativePath.WHATSAPP_CONTACTS, + domain_like=self.identifiers.DOMAIN, + output_filename=self.identifiers.CONTACT, + ) + self.backup.extract_file( + relative_path=RelativePath.WHATSAPP_CALLS, + domain_like=self.identifiers.DOMAIN, + output_filename=self.identifiers.CALL, + ) + except ValueError: + print("Failed to decrypt backup: incorrect password?") + exit(7) + except FileNotFoundError: + print( + "Essential WhatsApp files are missing from the iOS backup. " + "Perhapse you enabled end-to-end encryption for the backup? " + "See https://wts.knugi.dev/docs.html?dest=iose2e" + ) + exit(6) + else: + print("Done") + + def _extract_decrypted_files(self): + """Extract all WhatsApp files after decryption""" + def extract_progress_handler(file_id, domain, relative_path, n, total_files): + if n % 100 == 0: + print(f"Decrypting and extracting files...({n}/{total_files})", end="\r") + return True + + self.backup.extract_files( + domain_like=self.identifiers.DOMAIN, + output_folder=self.identifiers.DOMAIN, + preserve_folders=True, + filter_callback=extract_progress_handler + ) + print(f"All required files are decrypted and extracted. ", end="\n") + + def _extract_unencrypted_backup(self): + """ + Handles the extraction of data from an unencrypted iOS backup. + """ + self._copy_whatsapp_databases() + self._extract_media_files() + + def _copy_whatsapp_databases(self): + """ + Copies the WhatsApp message, contact, and call databases to the working directory. + """ + wts_db_path = os.path.join(self.base_dir, self.identifiers.MESSAGE[:2], self.identifiers.MESSAGE) + contact_db_path = os.path.join(self.base_dir, self.identifiers.CONTACT[:2], self.identifiers.CONTACT) + call_db_path = os.path.join(self.base_dir, self.identifiers.CALL[:2], self.identifiers.CALL) + + if not os.path.isfile(wts_db_path): + if self.identifiers is WhatsAppIdentifier: print("WhatsApp database not found.") else: print("WhatsApp Business database not found.") - exit() + print( + "Essential WhatsApp files are missing from the iOS backup. " + "Perhapse you enabled end-to-end encryption for the backup? " + "See https://wts.knugi.dev/docs.html?dest=iose2e" + ) + exit(1) else: - shutil.copyfile(wts_db, identifiers.MESSAGE) - if not os.path.isfile(contact_db): + shutil.copyfile(wts_db_path, self.identifiers.MESSAGE) + + if not os.path.isfile(contact_db_path): print("Contact database not found. Skipping...") else: - shutil.copyfile(contact_db, identifiers.CONTACT) - if not os.path.isfile(call_db): + shutil.copyfile(contact_db_path, self.identifiers.CONTACT) + + if not os.path.isfile(call_db_path): print("Call database not found. Skipping...") else: - shutil.copyfile(call_db, identifiers.CALL) - _wts_id = identifiers.DOMAIN - with sqlite3.connect(os.path.join(base_dir, "Manifest.db")) as manifest: + shutil.copyfile(call_db_path, self.identifiers.CALL) + + def _extract_media_files(self): + """ + Extracts media files from the unencrypted backup. + """ + _wts_id = self.identifiers.DOMAIN + with sqlite3.connect(os.path.join(self.base_dir, "Manifest.db")) as manifest: manifest.row_factory = sqlite3.Row c = manifest.cursor() - c.execute( - f"""SELECT count() - FROM Files - WHERE domain = '{_wts_id}'""" - ) + c.execute(f"SELECT count() FROM Files WHERE domain = '{_wts_id}'") total_row_number = c.fetchone()[0] print(f"Extracting WhatsApp files...(0/{total_row_number})", end="\r") - c.execute(f"""SELECT fileID, - relativePath, - flags, - file AS metadata, - ROW_NUMBER() OVER(ORDER BY relativePath) AS _index - FROM Files - WHERE domain = '{_wts_id}' - ORDER BY relativePath""") + c.execute( + f""" + SELECT fileID, relativePath, flags, file AS metadata, + ROW_NUMBER() OVER(ORDER BY relativePath) AS _index + FROM Files + WHERE domain = '{_wts_id}' + ORDER BY relativePath + """ + ) if not os.path.isdir(_wts_id): os.mkdir(_wts_id) + row = c.fetchone() while row is not None: - if row["relativePath"] == "": + if not row["relativePath"]: # Skip empty relative paths row = c.fetchone() continue + destination = os.path.join(_wts_id, row["relativePath"]) hashes = row["fileID"] folder = hashes[:2] flags = row["flags"] - if flags == 2: + + if flags == 2: # Directory try: os.mkdir(destination) except FileExistsError: pass - elif flags == 1: - shutil.copyfile(os.path.join(base_dir, folder, hashes), destination) + elif flags == 1: # File + shutil.copyfile(os.path.join(self.base_dir, folder, hashes), destination) metadata = BPListReader(row["metadata"]).parse() creation = metadata["$objects"][1]["Birth"] modification = metadata["$objects"][1]["LastModified"] os.utime(destination, (modification, modification)) + if row["_index"] % 100 == 0: print(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})", end="\r") row = c.fetchone() print(f"Extracting WhatsApp files...({total_row_number}/{total_row_number})", end="\n") + + +def extract_media(base_dir, identifiers, decrypt_chunk_size): + """ + Extracts WhatsApp data (media, messages, contacts, calls) from an iOS backup. + + Args: + base_dir (str): The path to the iOS backup directory. + identifiers (WhatsAppIdentifier): An object containing WhatsApp file identifiers. + decrypt_chunk_size (int): The chunk size for decryption. + """ + extractor = BackupExtractor(base_dir, identifiers, decrypt_chunk_size) + extractor.extract() + diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 259c318..6d9fd36 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -1,3 +1,4 @@ +import sqlite3 import jinja2 import json import os @@ -9,6 +10,7 @@ from markupsafe import Markup from datetime import datetime, timedelta from enum import IntEnum from Whatsapp_Chat_Exporter.data_model import ChatStore +from typing import Dict, List, Optional, Tuple try: from enum import StrEnum, IntEnum except ImportError: @@ -26,7 +28,15 @@ ROW_SIZE = 0x3D0 CURRENT_TZ_OFFSET = datetime.now().astimezone().utcoffset().seconds / 3600 -def convert_time_unit(time_second: int): +def convert_time_unit(time_second: int) -> str: + """Converts a time duration in seconds to a human-readable string. + + Args: + time_second: The time duration in seconds. + + Returns: + str: A human-readable string representing the time duration. + """ time = str(timedelta(seconds=time_second)) if "day" not in time: if time_second < 1: @@ -46,11 +56,19 @@ def convert_time_unit(time_second: int): return time -def bytes_to_readable(size_bytes: int): - """From https://stackoverflow.com/a/14822210/9478891 +def bytes_to_readable(size_bytes: int) -> str: + """Converts a file size in bytes to a human-readable string with units. + + From https://stackoverflow.com/a/14822210/9478891 Authors: james-sapam & other contributors Licensed under CC BY-SA 3.0 See git commit logs for changes, if any. + + Args: + size_bytes: The file size in bytes. + + Returns: + A human-readable string representing the file size. """ if size_bytes == 0: return "0B" @@ -61,7 +79,18 @@ def bytes_to_readable(size_bytes: int): return "%s %s" % (s, size_name[i]) -def readable_to_bytes(size_str: str): +def readable_to_bytes(size_str: str) -> int: + """Converts a human-readable file size string to bytes. + + Args: + size_str: The human-readable file size string (e.g., "1024KB", "1MB", "2GB"). + + Returns: + The file size in bytes. + + Raises: + ValueError: If the input string is invalid. + """ SIZE_UNITS = { 'B': 1, 'KB': 1024, @@ -80,11 +109,28 @@ def readable_to_bytes(size_str: str): return int(number) * SIZE_UNITS[unit] -def sanitize_except(html): +def sanitize_except(html: str) -> Markup: + """Sanitizes HTML, only allowing
tag. + + Args: + html: The HTML string to sanitize. + + Returns: + A Markup object containing the sanitized HTML. + """ return Markup(sanitize(html, tags=["br"])) -def determine_day(last, current): +def determine_day(last: int, current: int) -> Optional[datetime.date]: + """Determines if the day has changed between two timestamps. Exposed to Jinja's environment. + + Args: + last: The timestamp of the previous message. + current: The timestamp of the current message. + + Returns: + The date of the current message if it's a different day than the last message, otherwise None. + """ last = datetime.fromtimestamp(last).date() current = datetime.fromtimestamp(current).date() if last == current: @@ -96,12 +142,12 @@ def determine_day(last, current): def check_update(): import urllib.request import json + import importlib from sys import platform - from .__init__ import __version__ - package_url_json = "https://pypi.org/pypi/whatsapp-chat-exporter/json" + PACKAGE_JSON = "https://pypi.org/pypi/whatsapp-chat-exporter/json" try: - raw = urllib.request.urlopen(package_url_json) + raw = urllib.request.urlopen(PACKAGE_JSON) except Exception: print("Failed to check for updates.") return 1 @@ -109,6 +155,7 @@ def check_update(): with raw: package_info = json.load(raw) latest_version = tuple(map(int, package_info["info"]["version"].split("."))) + __version__ = importlib.metadata.version("whatsapp_chat_exporter") current_version = tuple(map(int, __version__.split("."))) if current_version < latest_version: print("===============Update===============") @@ -168,7 +215,13 @@ class Device(StrEnum): EXPORTED = "exported" -def import_from_json(json_file, data): +def import_from_json(json_file: str, data: Dict[str, ChatStore]): + """Imports chat data from a JSON file into the data dictionary. + + Args: + json_file: The path to the JSON file. + data: The dictionary to store the imported chat data. + """ from Whatsapp_Chat_Exporter.data_model import ChatStore, Message with open(json_file, "r") as f: temp_data = json.loads(f.read()) @@ -182,10 +235,12 @@ def import_from_json(json_file, data): chat.status = chat_data.get("status") for id, msg in chat_data.get("messages").items(): message = Message( - msg["from_me"], - msg["timestamp"], - msg["time"], - msg["key_id"], + from_me=msg["from_me"], + timestamp=msg["timestamp"], + time=msg["time"], + key_id=msg["key_id"], + received_timestamp=msg.get("received_timestamp"), + read_timestamp=msg.get("read_timestamp") ) message.media = msg.get("media") message.meta = msg.get("meta") @@ -203,11 +258,31 @@ def import_from_json(json_file, data): print(f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") -def sanitize_filename(file_name: str): +def sanitize_filename(file_name: str) -> str: + """Sanitizes a filename by removing invalid and unsafe characters. + + Args: + file_name: The filename to sanitize. + + Returns: + The sanitized filename. + """ return "".join(x for x in file_name if x.isalnum() or x in "- ") -def get_file_name(contact: str, chat: ChatStore): +def get_file_name(contact: str, chat: ChatStore) -> Tuple[str, str]: + """Generates a sanitized filename and contact name for a chat. + + Args: + contact: The contact identifier (e.g., a phone number or group ID). + chat: The ChatStore object for the chat. + + Returns: + A tuple containing the sanitized filename and the contact name. + + Raises: + ValueError: If the contact format is unexpected. + """ if "@" not in contact and contact not in ("000000000000000", "000000000000001", "ExportedChat"): raise ValueError("Unexpected contact format: " + contact) phone_number = contact.split('@')[0] @@ -227,11 +302,36 @@ def get_file_name(contact: str, chat: ChatStore): return sanitize_filename(file_name), name -def get_cond_for_empty(enable, jid_field: str, broadcast_field: str): +def get_cond_for_empty(enable: bool, jid_field: str, broadcast_field: str) -> str: + """Generates a SQL condition for filtering empty chats. + + Args: + enable: True to include non-empty chats, False to include empty chats. + jid_field: The name of the JID field in the SQL query. + broadcast_field: The column name of the broadcast field in the SQL query. + + Returns: + A SQL condition string. + """ return f"AND (chat.hidden=0 OR {jid_field}='status@broadcast' OR {broadcast_field}>0)" if enable else "" -def get_chat_condition(filter, include, columns, jid=None, platform=None): +def get_chat_condition(filter: Optional[List[str]], include: bool, columns: List[str], jid: Optional[str] = None, platform: Optional[str] = None) -> str: + """Generates a SQL condition for filtering chats based on inclusion or exclusion criteria. + + Args: + filter: A list of phone numbers to include or exclude. + include: True to include chats that match the filter, False to exclude them. + columns: A list of column names to check against the filter. + jid: The JID column name (used for group identification). + platform: The platform ("android" or "ios") for platform-specific JID queries. + + Returns: + A SQL condition string. + + Raises: + ValueError: If the column count is invalid or an unsupported platform is provided. + """ if filter is not None: conditions = [] if len(columns) < 2 and jid is not None: @@ -279,13 +379,16 @@ class DbType(StrEnum): CONTACT = "contact" -def brute_force_offset(max_iv=200, max_db=200): - for iv in range(0, max_iv): - for db in range(0, max_db): - yield iv, iv + 16, db +def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optional[str]: + """Determines the metadata of a message. + Args: + content (sqlite3.Row): A row from the messages table. + init_msg (Optional[str]): The initial message, if any. -def determine_metadata(content, init_msg): + Returns: + The metadata as a string or None if the type is unsupported. + """ msg = init_msg if init_msg else "" if content["is_me_joined"] == 1: # Override return f"You were added into the group by {msg}" @@ -333,7 +436,7 @@ def determine_metadata(content, init_msg): msg = "Someone joined this group by using a invite link" # TODO: Find out who elif content["action_type"] == 27: msg += " changed the group description to:
" - msg += content['data'].replace("\n", '
') + msg += (content['data'] or "Unknown").replace("\n", '
') elif content["action_type"] == 28: try: old = content['old_jid'].split('@')[0] @@ -366,7 +469,17 @@ def determine_metadata(content, init_msg): return msg -def get_status_location(output_folder, offline_static): +def get_status_location(output_folder: str, offline_static: str) -> str: + """ + Gets the location of the W3.CSS file, either from web or local storage. + + Args: + output_folder (str): The folder where offline static files will be stored. + offline_static (str): The subfolder name for static files. If falsy, returns web URL. + + Returns: + str: The path or URL to the W3.CSS file. + """ w3css = "https://www.w3schools.com/w3css/4/w3.css" if not offline_static: return w3css @@ -381,7 +494,18 @@ def get_status_location(output_folder, offline_static): w3css = os.path.join(offline_static, "w3.css") -def setup_template(template, no_avatar, experimental=False): +def setup_template(template: Optional[str], no_avatar: bool, experimental: bool = False) -> jinja2.Template: + """ + Sets up the Jinja2 template environment and loads the template. + + Args: + template (Optional[str]): Path to custom template file. If None, uses default template. + no_avatar (bool): Whether to disable avatar display in the template. + experimental (bool, optional): Whether to use experimental template features. Defaults to False. + + Returns: + jinja2.Template: The configured Jinja2 template object. + """ if template is None or experimental: template_dir = os.path.dirname(__file__) template_file = "whatsapp.html" if not experimental else template @@ -401,13 +525,17 @@ def setup_template(template, no_avatar, experimental=False): APPLE_TIME = 978307200 -def slugify(value, allow_unicode=False): +def slugify(value: str, allow_unicode: bool = False) -> str: """ + Convert text to ASCII-only slugs for URL-safe strings. Taken from https://github.com/django/django/blob/master/django/utils/text.py - Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated - dashes to single dashes. Remove characters that aren't alphanumerics, - underscores, or hyphens. Convert to lowercase. Also strip leading and - trailing whitespace, dashes, and underscores. + + Args: + value (str): The string to convert to a slug. + allow_unicode (bool, optional): Whether to allow Unicode characters. Defaults to False. + + Returns: + str: The slugified string with only alphanumerics, underscores, or hyphens. """ value = str(value) if allow_unicode: @@ -419,16 +547,17 @@ def slugify(value, allow_unicode=False): class WhatsAppIdentifier(StrEnum): - MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" - CONTACT = "b8548dc30aa1030df0ce18ef08b882cf7ab5212f" - CALL = "1b432994e958845fffe8e2f190f26d1511534088" + MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ChatStorage.sqlite + CONTACT = "b8548dc30aa1030df0ce18ef08b882cf7ab5212f" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ContactsV2.sqlite + CALL = "1b432994e958845fffe8e2f190f26d1511534088" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-CallHistory.sqlite DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsApp.shared" class WhatsAppBusinessIdentifier(StrEnum): - MESSAGE = "724bd3b98b18518b455a87c1f3ac3a0d189c4466" - CONTACT = "d7246a707f51ddf8b17ee2dddabd9e0a4da5c552" - DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared" + MESSAGE = "724bd3b98b18518b455a87c1f3ac3a0d189c4466" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ChatStorage.sqlite + CONTACT = "d7246a707f51ddf8b17ee2dddabd9e0a4da5c552" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ContactsV2.sqlite + CALL = "b463f7c4365eefc5a8723930d97928d4e907c603" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-CallHistory.sqlite + DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared" class JidType(IntEnum): PM = 0 diff --git a/Whatsapp_Chat_Exporter/whatsapp_new.html b/Whatsapp_Chat_Exporter/whatsapp_new.html index 7502a7b..2aa2b7e 100644 --- a/Whatsapp_Chat_Exporter/whatsapp_new.html +++ b/Whatsapp_Chat_Exporter/whatsapp_new.html @@ -123,6 +123,10 @@ .reply-box:active { background-color:rgb(200 202 205 / var(--tw-bg-opacity, 1)); } + .info-box-tooltip { + --tw-translate-x: -50%; + transform: translate(var(--tw-translate-x), var(--tw-translate-y)) rotate(var(--tw-rotate)) skewX(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y)); + }