diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..f5e4351 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,39 @@ +name: Run Pytest on Dev Branch Push + +on: + push: + branches: + - dev + +jobs: + ci: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.13", "3.14"] + include: + - os: ubuntu-latest + python-version: "3.10" + - os: ubuntu-latest + python-version: "3.11" + - os: ubuntu-latest + python-version: "3.12" + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} on ${{ matrix.os }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[all] pytest nuitka + + - name: Run pytest + run: pytest diff --git a/.github/workflows/compile-binary.yml b/.github/workflows/compile-binary.yml index 04532bf..013bb34 100644 --- a/.github/workflows/compile-binary.yml +++ b/.github/workflows/compile-binary.yml @@ -7,6 +7,9 @@ on: permissions: contents: read + id-token: write + attestations: write + jobs: linux: @@ -20,7 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pycryptodome vobject javaobj-py3 ordered-set zstandard nuitka==2.8.9 + pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka==2.8.9 pip install . 
- name: Build binary with Nuitka run: | @@ -28,6 +31,10 @@ jobs: --include-data-file=./Whatsapp_Chat_Exporter/whatsapp.html=./Whatsapp_Chat_Exporter/whatsapp.html \ --assume-yes-for-downloads Whatsapp_Chat_Exporter --output-filename=wtsexporter_linux_x64 sha256sum wtsexporter_linux_x64 + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v3 + with: + subject-path: ./wtsexporter_linux_x64 - uses: actions/upload-artifact@v6 with: name: binary-linux @@ -45,13 +52,17 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pycryptodome vobject javaobj-py3 ordered-set zstandard nuitka==2.8.9 + pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka==2.8.9 pip install . - name: Build binary with Nuitka run: | python -m nuitka --onefile --include-data-file=./Whatsapp_Chat_Exporter/whatsapp.html=./Whatsapp_Chat_Exporter/whatsapp.html --assume-yes-for-downloads Whatsapp_Chat_Exporter --output-filename=wtsexporter - copy wtsexporter.exe wtsexporter_x64.exe + Rename-Item -Path "wtsexporter.exe" -NewName "wtsexporter_x64.exe" Get-FileHash wtsexporter_x64.exe + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v3 + with: + subject-path: .\wtsexporter_x64.exe - uses: actions/upload-artifact@v6 with: name: binary-windows @@ -69,16 +80,21 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install pycryptodome vobject javaobj-py3 ordered-set zstandard nuitka==2.8.9 + pip install pycryptodome javaobj-py3 ordered-set zstandard nuitka==2.8.9 pip install . 
- name: Build binary with Nuitka run: | python -m nuitka --onefile \ --include-data-file=./Whatsapp_Chat_Exporter/whatsapp.html=./Whatsapp_Chat_Exporter/whatsapp.html \ - --assume-yes-for-downloads Whatsapp_Chat_Exporter --output-filename=wtsexporter_macos_x64 - shasum -a 256 wtsexporter_macos_x64 + --assume-yes-for-downloads Whatsapp_Chat_Exporter --output-filename=wtsexporter_macos_arm64 + shasum -a 256 wtsexporter_macos_arm64 + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v3 + with: + subject-path: ./wtsexporter_macos_arm64 - uses: actions/upload-artifact@v6 with: name: binary-macos path: | - ./wtsexporter_macos_x64 + ./wtsexporter_macos_arm64 + diff --git a/.gitignore b/.gitignore index 5831f34..c427c08 100644 --- a/.gitignore +++ b/.gitignore @@ -138,7 +138,9 @@ __main__ # Dev time intermidiates & temp files result/ +output/ WhatsApp/ +AppDomainGroup-group.net.whatsapp.WhatsApp.shared/ /*.db /*.db-* /myout diff --git a/LICENSE.django b/LICENSE.django deleted file mode 100644 index 4330494..0000000 --- a/LICENSE.django +++ /dev/null @@ -1,32 +0,0 @@ -The Whatsapp Chat Exporter is licensed under the MIT license. For more information, -refer to https://github.com/KnugiHK/WhatsApp-Chat-Exporter/wiki/Open-Source-Licenses. - ------- - -Copyright (c) Django Software Foundation and individual contributors. -All rights reserved. - -Redistribution and use in source and binary forms, with or without modification, -are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. 
Neither the name of Django nor the names of its contributors may be used - to endorse or promote products derived from this software without - specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index fad1158..3566b9f 100644 --- a/README.md +++ b/README.md @@ -136,12 +136,10 @@ wtsexporter -i -b ~/Library/Application\ Support/MobileSync/Backup/[device id] ``` ## Results -After extracting, you will get these: -#### Private Message +After extracting, you will get this: + ![Private Message](imgs/pm.png) -#### Group Message -![Group Message](imgs/group.png) ## More options Invoke the wtsexporter with --help option will show you all options available. @@ -233,6 +231,19 @@ Contact Enrichment: Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. Most likely use the number of your own country +Incremental Merging: + --incremental-merge Performs an incremental merge of two exports. Requires setting both --source- + dir and --target-dir. The chats (JSON files only) and media from the source + directory will be merged into the target directory. 
No chat messages or media + will be deleted from the target directory; only new chat messages and media + will be added to it. This enables chat messages and media to be deleted from + the device to free up space, while ensuring they are preserved in the exported + backups. + --source-dir SOURCE_DIR + Sets the source directory. Used for performing incremental merges. + --target-dir TARGET_DIR + Sets the target directory. Used for performing incremental merges. + Miscellaneous: -s, --showkey Show the HEX key used to decrypt the database --check-update Check for updates (require Internet access) @@ -243,10 +254,14 @@ Miscellaneous: --max-bruteforce-worker MAX_BRUTEFORCE_WORKER Specify the maximum number of worker for bruteforce decryption. -WhatsApp Chat Exporter: 0.12.1 Licensed with MIT. See https://wts.knugi.dev/docs?dest=osl for all open source +WhatsApp Chat Exporter: 0.13.0rc1 Licensed with MIT. See https://wts.knugi.dev/docs?dest=osl for all open source licenses. ``` +# Python Support Policy + +This project officially supports all non-EOL (End-of-Life) versions of Python. Once a Python version reaches EOL, it is dropped in the next release. See [Python's EOL Schedule](https://devguide.python.org/versions/). + # Legal Stuff & Disclaimer This is a MIT licensed project. 
diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index b9a8578..b844921 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -7,39 +7,60 @@ import shutil import json import string import glob +import logging import importlib.metadata from Whatsapp_Chat_Exporter import android_crypt, exported_handler, android_handler from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, Crypt, check_update, DbType -from Whatsapp_Chat_Exporter.utility import readable_to_bytes, sanitize_filename -from Whatsapp_Chat_Exporter.utility import import_from_json, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update +from Whatsapp_Chat_Exporter.utility import readable_to_bytes, safe_name, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, DbType +from Whatsapp_Chat_Exporter.utility import telegram_json_format from argparse import ArgumentParser, SUPPRESS from datetime import datetime from getpass import getpass from sys import exit -from typing import Tuple, Optional, List, Dict, Any, Union +from typing import Optional, List, Dict +from Whatsapp_Chat_Exporter.vcards_contacts import ContactsFromVCards -# Try to import vobject for contacts processing -try: - import vobject -except ModuleNotFoundError: - vcards_deps_installed = False -else: - from Whatsapp_Chat_Exporter.vcards_contacts import ContactsFromVCards - vcards_deps_installed = True + +logger = logging.getLogger(__name__) +__version__ = importlib.metadata.version("whatsapp_chat_exporter") +WTSEXPORTER_BANNER = f"""======================================================================================================== + ██╗ ██╗██╗ ██╗ █████╗ ████████╗███████╗ █████╗ ██████╗ ██████╗ + ██║ ██║██║ 
██║██╔══██╗╚══██╔══╝██╔════╝██╔══██╗██╔══██╗██╔══██╗ + ██║ █╗ ██║███████║███████║ ██║ ███████╗███████║██████╔╝██████╔╝ + ██║███╗██║██╔══██║██╔══██║ ██║ ╚════██║██╔══██║██╔═══╝ ██╔═══╝ + ╚███╔███╔╝██║ ██║██║ ██║ ██║ ███████║██║ ██║██║ ██║ + ╚══╝╚══╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═╝ + + ██████╗██╗ ██╗ █████╗ ████████╗ ███████╗██╗ ██╗██████╗ ██████╗ ██████╗ ████████╗███████╗██████╗ +██╔════╝██║ ██║██╔══██╗╚══██╔══╝ ██╔════╝╚██╗██╔╝██╔══██╗██╔═══██╗██╔══██╗╚══██╔══╝██╔════╝██╔══██╗ +██║ ███████║███████║ ██║ █████╗ ╚███╔╝ ██████╔╝██║ ██║██████╔╝ ██║ █████╗ ██████╔╝ +██║ ██╔══██║██╔══██║ ██║ ██╔══╝ ██╔██╗ ██╔═══╝ ██║ ██║██╔══██╗ ██║ ██╔══╝ ██╔══██╗ +╚██████╗██║ ██║██║ ██║ ██║ ███████╗██╔╝ ██╗██║ ╚██████╔╝██║ ██║ ██║ ███████╗██║ ██║ + ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚══════╝╚═╝ ╚═╝ + + WhatsApp Chat Exporter: A customizable Android and iOS/iPadOS WhatsApp database parser + Version: {__version__} +========================================================================================================""" def setup_argument_parser() -> ArgumentParser: """Set up and return the argument parser with all options.""" parser = ArgumentParser( description='A customizable Android and iOS/iPadOS WhatsApp database parser that ' - 'will give you the history of your WhatsApp conversations in HTML ' - 'and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported.', - epilog=f'WhatsApp Chat Exporter: {importlib.metadata.version("whatsapp_chat_exporter")} Licensed with MIT. See ' - 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' + 'will give you the history of your WhatsApp conversations in HTML ' + 'and JSON. Android Backup Crypt12, Crypt14 and Crypt15 supported.', + epilog=f'WhatsApp Chat Exporter: {__version__} Licensed with MIT. See ' + 'https://wts.knugi.dev/docs?dest=osl for all open source licenses.' 
+ ) + + # General options + parser.add_argument( + "--debug", dest="debug", default=False, action='store_true', + help="Enable debug mode" ) - # Device type arguments device_group = parser.add_argument_group('Device Type') device_group.add_argument( @@ -54,7 +75,7 @@ def setup_argument_parser() -> ArgumentParser: "-e", "--exported", dest="exported", default=None, help="Define the target as exported chat file and specify the path to the file" ) - + # Input file paths input_group = parser.add_argument_group('Input Files') input_group.add_argument( @@ -86,7 +107,7 @@ def setup_argument_parser() -> ArgumentParser: "--wab", "--wa-backup", dest="wab", default=None, help="Path to contact database in crypt15 format" ) - + # Output options output_group = parser.add_argument_group('Output Options') output_group.add_argument( @@ -106,10 +127,14 @@ def setup_argument_parser() -> ArgumentParser: help="Do not output html files" ) output_group.add_argument( - "--size", "--output-size", "--split", dest="size", nargs='?', const=0, default=None, + "--size", "--output-size", "--split", dest="size", nargs='?', const="0", default=None, help="Maximum (rough) size of a single output file in bytes, 0 for auto" ) - + output_group.add_argument( + "--no-reply", dest="no_reply_ios", default=False, action='store_true', + help="Do not process replies (iOS only) (default: handle replies)" + ) + # JSON formatting options json_group = parser.add_argument_group('JSON Options') json_group.add_argument( @@ -120,6 +145,10 @@ def setup_argument_parser() -> ArgumentParser: '--pretty-print-json', dest='pretty_print_json', default=None, nargs='?', const=2, type=int, help="Pretty print the output JSON." 
) + json_group.add_argument( + "--tg", "--telegram", dest="telegram", default=False, action='store_true', + help="Output the JSON in a format compatible with Telegram export (implies json-per-chat)" + ) json_group.add_argument( "--per-chat", dest="json_per_chat", default=False, action='store_true', help="Output the JSON file per chat" @@ -128,7 +157,7 @@ def setup_argument_parser() -> ArgumentParser: "--import", dest="import_json", default=False, action='store_true', help="Import JSON file and convert to HTML output" ) - + # HTML options html_group = parser.add_argument_group('HTML Options') html_group.add_argument( @@ -148,14 +177,14 @@ def setup_argument_parser() -> ArgumentParser: help="Do not render avatar in HTML output" ) html_group.add_argument( - "--experimental-new-theme", dest="whatsapp_theme", default=False, action='store_true', - help="Use the newly designed WhatsApp-alike theme" + "--old-theme", dest="telegram_theme", default=False, action='store_true', + help="Use the old Telegram-alike theme" ) html_group.add_argument( "--headline", dest="headline", default="Chat history with ??", help="The custom headline for the HTML output. Use '??' as a placeholder for the chat name" ) - + # Media handling media_group = parser.add_argument_group('Media Handling') media_group.add_argument( @@ -166,7 +195,7 @@ def setup_argument_parser() -> ArgumentParser: "--create-separated-media", dest="separate_media", default=False, action='store_true', help="Create a copy of the media seperated per chat in /separated/ directory" ) - + # Filtering options filter_group = parser.add_argument_group('Filtering Options') filter_group.add_argument( @@ -195,7 +224,7 @@ def setup_argument_parser() -> ArgumentParser: "Setting this flag will cause the exporter to render those. 
" "This is useful if chat(s) are missing from the output") ) - + # Contact enrichment contact_group = parser.add_argument_group('Contact Enrichment') contact_group.add_argument( @@ -206,7 +235,34 @@ def setup_argument_parser() -> ArgumentParser: "--default-country-code", dest="default_country_code", default=None, help="Use with --enrich-from-vcards. When numbers in the vcf file does not have a country code, this will be used. 1 is for US, 66 for Thailand etc. Most likely use the number of your own country" ) - + + # Incremental merging + inc_merging_group = parser.add_argument_group('Incremental Merging') + inc_merging_group.add_argument( + "--incremental-merge", + dest="incremental_merge", + default=False, + action='store_true', + help=("Performs an incremental merge of two exports. " + "Requires setting both --source-dir and --target-dir. " + "The chats (JSON files only) and media from the source directory will be merged into the target directory. " + "No chat messages or media will be deleted from the target directory; only new chat messages and media will be added to it. " + "This enables chat messages and media to be deleted from the device to free up space, while ensuring they are preserved in the exported backups." + ) + ) + inc_merging_group.add_argument( + "--source-dir", + dest="source_dir", + default=None, + help="Sets the source directory. Used for performing incremental merges." + ) + inc_merging_group.add_argument( + "--target-dir", + dest="target_dir", + default=None, + help="Sets the target directory. Used for performing incremental merges." + ) + # Miscellaneous misc_group = parser.add_argument_group('Miscellaneous') misc_group.add_argument( @@ -233,7 +289,11 @@ def setup_argument_parser() -> ArgumentParser: "--max-bruteforce-worker", dest="max_bruteforce_worker", default=10, type=int, help="Specify the maximum number of worker for bruteforce decryption." 
) - + misc_group.add_argument( + "--no-banner", dest="no_banner", default=False, action='store_true', + help="Do not show the banner" + ) + return parser @@ -245,50 +305,60 @@ def validate_args(parser: ArgumentParser, args) -> None: if not args.android and not args.ios and not args.exported and not args.import_json: parser.error("You must define the device type.") if args.no_html and not args.json and not args.text_format: - parser.error("You must either specify a JSON output file, text file output directory or enable HTML output.") + parser.error( + "You must either specify a JSON output file, text file output directory or enable HTML output.") if args.import_json and (args.android or args.ios or args.exported or args.no_html): - parser.error("You can only use --import with -j and without --no-html, -a, -i, -e.") + parser.error( + "You can only use --import with -j and without --no-html, -a, -i, -e.") elif args.import_json and not os.path.isfile(args.json): parser.error("JSON file not found.") + if args.incremental_merge and (args.source_dir is None or args.target_dir is None): + parser.error( + "You must specify both --source-dir and --target-dir for incremental merge.") if args.android and args.business: parser.error("WhatsApp Business is only available on iOS for now.") if "??" not in args.headline: parser.error("--headline must contain '??' 
for replacement.") - + # JSON validation if args.json_per_chat and args.json and ( - (args.json.endswith(".json") and os.path.isfile(args.json)) or + (args.json.endswith(".json") and os.path.isfile(args.json)) or (not args.json.endswith(".json") and os.path.isfile(args.json)) ): - parser.error("When --per-chat is enabled, the destination of --json must be a directory.") - + parser.error( + "When --per-chat is enabled, the destination of --json must be a directory.") + # vCards validation if args.enrich_from_vcards is not None and args.default_country_code is None: - parser.error("When --enrich-from-vcards is provided, you must also set --default-country-code") - - # Size validation - if args.size is not None and not isinstance(args.size, int) and not args.size.isnumeric(): + parser.error( + "When --enrich-from-vcards is provided, you must also set --default-country-code") + + # Size validation and conversion + if args.size is not None: try: args.size = readable_to_bytes(args.size) except ValueError: - parser.error("The value for --split must be ended in pure bytes or with a proper unit (e.g., 1048576 or 1MB)") - + parser.error( + "The value for --split must be pure bytes or use a proper unit (e.g., 1048576 or 1MB)" + ) + # Date filter validation and processing if args.filter_date is not None: process_date_filter(parser, args) - + # Crypt15 key validation if args.key is None and args.backup is not None and args.backup.endswith("crypt15"): args.key = getpass("Enter your encryption key: ") - + # Theme validation - if args.whatsapp_theme: - args.template = "whatsapp_new.html" - + if args.telegram_theme: + args.template = "whatsapp_old.html" + # Chat filter validation if args.filter_chat_include is not None and args.filter_chat_exclude is not None: - parser.error("Chat inclusion and exclusion filters cannot be used together.") - + parser.error( + "Chat inclusion and exclusion filters cannot be used together.") + validate_chat_filters(parser, args.filter_chat_include) 
validate_chat_filters(parser, args.filter_chat_exclude) @@ -298,21 +368,24 @@ def validate_chat_filters(parser: ArgumentParser, chat_filter: Optional[List[str if chat_filter is not None: for chat in chat_filter: if not chat.isnumeric(): - parser.error("Enter a phone number in the chat filter. See https://wts.knugi.dev/docs?dest=chat") + parser.error( + "Enter a phone number in the chat filter. See https://wts.knugi.dev/docs?dest=chat") def process_date_filter(parser: ArgumentParser, args) -> None: """Process and validate date filter arguments.""" if " - " in args.filter_date: start, end = args.filter_date.split(" - ") - start = int(datetime.strptime(start, args.filter_date_format).timestamp()) + start = int(datetime.strptime( + start, args.filter_date_format).timestamp()) end = int(datetime.strptime(end, args.filter_date_format).timestamp()) - + if start < 1009843200 or end < 1009843200: parser.error("WhatsApp was first released in 2009...") if start > end: - parser.error("The start date cannot be a moment after the end date.") - + parser.error( + "The start date cannot be a moment after the end date.") + if args.android: args.filter_date = f"BETWEEN {start}000 AND {end}000" elif args.ios: @@ -324,13 +397,15 @@ def process_date_filter(parser: ArgumentParser, args) -> None: def process_single_date_filter(parser: ArgumentParser, args) -> None: """Process single date comparison filters.""" if len(args.filter_date) < 3: - parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") - - _timestamp = int(datetime.strptime(args.filter_date[2:], args.filter_date_format).timestamp()) - + parser.error( + "Unsupported date format. 
See https://wts.knugi.dev/docs?dest=date") + + _timestamp = int(datetime.strptime( + args.filter_date[2:], args.filter_date_format).timestamp()) + if _timestamp < 1009843200: parser.error("WhatsApp was first released in 2009...") - + if args.filter_date[:2] == "> ": if args.android: args.filter_date = f">= {_timestamp}000" @@ -342,21 +417,16 @@ def process_single_date_filter(parser: ArgumentParser, args) -> None: elif args.ios: args.filter_date = f"<= {_timestamp - APPLE_TIME}" else: - parser.error("Unsupported date format. See https://wts.knugi.dev/docs?dest=date") + parser.error( + "Unsupported date format. See https://wts.knugi.dev/docs?dest=date") def setup_contact_store(args) -> Optional['ContactsFromVCards']: """Set up and return a contact store if needed.""" if args.enrich_from_vcards is not None: - if not vcards_deps_installed: - print( - "You don't have the dependency to enrich contacts with vCard.\n" - "Read more on how to deal with enriching contacts:\n" - "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" - ) - exit(1) contact_store = ContactsFromVCards() - contact_store.load_vcf_file(args.enrich_from_vcards, args.default_country_code) + contact_store.load_vcf_file( + args.enrich_from_vcards, args.default_country_code) return contact_store return None @@ -364,11 +434,11 @@ def setup_contact_store(args) -> Optional['ContactsFromVCards']: def decrypt_android_backup(args) -> int: """Decrypt Android backup files and return error code.""" if args.key is None or args.backup is None: - print("You must specify the backup file with -b and a key with -k") + logger.error(f"You must specify the backup file with -b and a key with -k{CLEAR_LINE}") return 1 - - print("Decryption key specified, decrypting WhatsApp backup...") - + + logger.info(f"Decryption key specified, decrypting WhatsApp backup...{CLEAR_LINE}") + # Determine crypt type if "crypt12" in args.backup: crypt = Crypt.CRYPT12 @@ -377,9 +447,10 @@ def decrypt_android_backup(args) 
-> int: elif "crypt15" in args.backup: crypt = Crypt.CRYPT15 else: - print("Unknown backup format. The backup file must be crypt12, crypt14 or crypt15.") + logger.error( + f"Unknown backup format. The backup file must be crypt12, crypt14 or crypt15.{CLEAR_LINE}") return 1 - + # Get key keyfile_stream = False if not os.path.isfile(args.key) and all(char in string.hexdigits for char in args.key.replace(" ", "")): @@ -387,10 +458,10 @@ def decrypt_android_backup(args) -> int: else: key = open(args.key, "rb") keyfile_stream = True - + # Read backup db = open(args.backup, "rb").read() - + # Process WAB if provided error_wa = 0 if args.wab: @@ -407,7 +478,7 @@ def decrypt_android_backup(args) -> int: ) if isinstance(key, io.IOBase): key.seek(0) - + # Decrypt message database error_message = android_crypt.decrypt_backup( db, @@ -419,7 +490,7 @@ def decrypt_android_backup(args) -> int: keyfile_stream=keyfile_stream, max_worker=args.max_bruteforce_worker ) - + # Handle errors if error_wa != 0: return error_wa @@ -429,22 +500,22 @@ def decrypt_android_backup(args) -> int: def handle_decrypt_error(error: int) -> None: """Handle decryption errors with appropriate messages.""" if error == 1: - print("Dependencies of decrypt_backup and/or extract_encrypted_key" - " are not present. For details, see README.md.") + logger.error("Dependencies of decrypt_backup and/or extract_encrypted_key" + " are not present. For details, see README.md.\n") exit(3) elif error == 2: - print("Failed when decompressing the decrypted backup. " - "Possibly incorrect offsets used in decryption.") + logger.error("Failed when decompressing the decrypted backup. 
" + "Possibly incorrect offsets used in decryption.\n") exit(4) else: - print("Unknown error occurred.", error) + logger.error("Unknown error occurred.\n") exit(5) -def process_contacts(args, data: ChatCollection, contact_store=None) -> None: +def process_contacts(args, data: ChatCollection) -> None: """Process contacts from the database.""" contact_db = args.wa if args.wa else "wa.db" if args.android else "ContactsV2.sqlite" - + if os.path.isfile(contact_db): with sqlite3.connect(contact_db) as db: db.row_factory = sqlite3.Row @@ -457,42 +528,42 @@ def process_contacts(args, data: ChatCollection, contact_store=None) -> None: def process_messages(args, data: ChatCollection) -> None: """Process messages, media and vcards from the database.""" msg_db = args.db if args.db else "msgstore.db" if args.android else args.identifiers.MESSAGE - + if not os.path.isfile(msg_db): - print( + logger.error( "The message database does not exist. You may specify the path " - "to database file with option -d or check your provided path." 
+ "to database file with option -d or check your provided path.\n" ) exit(6) - + filter_chat = (args.filter_chat_include, args.filter_chat_exclude) - + with sqlite3.connect(msg_db) as db: db.row_factory = sqlite3.Row - + # Process messages if args.android: message_handler = android_handler else: message_handler = ios_handler - + message_handler.messages( - db, data, args.media, args.timezone_offset, - args.filter_date, filter_chat, args.filter_empty + db, data, args.media, args.timezone_offset, args.filter_date, + filter_chat, args.filter_empty, args.no_reply_ios ) - + # Process media message_handler.media( - db, data, args.media, args.filter_date, + db, data, args.media, args.filter_date, filter_chat, args.filter_empty, args.separate_media ) - + # Process vcards message_handler.vcard( - db, data, args.media, args.filter_date, + db, data, args.media, args.filter_date, filter_chat, args.filter_empty ) - + # Process calls process_calls(args, db, data, filter_chat) @@ -511,30 +582,29 @@ def handle_media_directory(args) -> None: """Handle media directory copying or moving.""" if os.path.isdir(args.media): media_path = os.path.join(args.output, args.media) - + if os.path.isdir(media_path): - print("\nWhatsApp directory already exists in output directory. Skipping...", end="\n") + logger.info( + f"WhatsApp directory already exists in output directory. Skipping...{CLEAR_LINE}") else: if args.move_media: try: - print("\nMoving media directory...", end="\n") + logger.info(f"Moving media directory...\r") shutil.move(args.media, f"{args.output}/") + logger.info(f"Media directory has been moved to the output directory{CLEAR_LINE}") except PermissionError: - print("\nCannot remove original WhatsApp directory. " - "Perhaps the directory is opened?", end="\n") + logger.warning("Cannot remove original WhatsApp directory. 
" + "Perhaps the directory is opened?\n") else: - print("\nCopying media directory...", end="\n") + logger.info(f"Copying media directory...\r") shutil.copytree(args.media, media_path) + logger.info(f"Media directory has been copied to the output directory{CLEAR_LINE}") -def create_output_files(args, data: ChatCollection, contact_store=None) -> None: +def create_output_files(args, data: ChatCollection) -> None: """Create output files in the specified formats.""" # Create HTML files if requested if not args.no_html: - # Enrich from vcards if available - if contact_store and not contact_store.is_empty(): - contact_store.enrich_from_vcards(data) - android_handler.create_html( data, args.output, @@ -543,32 +613,29 @@ def create_output_files(args, data: ChatCollection, contact_store=None) -> None: args.offline, args.size, args.no_avatar, - args.whatsapp_theme, + args.telegram_theme, args.headline ) - + # Create text files if requested if args.text_format: - print("Writing text file...") + logger.info(f"Writing text file...{CLEAR_LINE}") android_handler.create_txt(data, args.text_format) - + # Create JSON files if requested if args.json and not args.import_json: - export_json(args, data, contact_store) + export_json(args, data) -def export_json(args, data: ChatCollection, contact_store=None) -> None: +def export_json(args, data: ChatCollection) -> None: """Export data to JSON format.""" - # Enrich from vcards if available - if contact_store and not contact_store.is_empty(): - contact_store.enrich_from_vcards(data) - + # TODO: remove all non-target chats from data if filtering is applied? 
# Convert ChatStore objects to JSON if isinstance(data.get(next(iter(data), None)), ChatStore): data = {jik: chat.to_json() for jik, chat in data.items()} - + # Export as a single file or per chat - if not args.json_per_chat: + if not args.json_per_chat and not args.telegram: export_single_json(args, data) else: export_multiple_json(args, data) @@ -582,19 +649,20 @@ def export_single_json(args, data: Dict) -> None: ensure_ascii=not args.avoid_encoding_json, indent=args.pretty_print_json ) - print(f"\nWriting JSON file...({bytes_to_readable(len(json_data))})") + logger.info(f"Writing JSON file...\r") f.write(json_data) + logger.info(f"JSON file saved...({bytes_to_readable(len(json_data))}){CLEAR_LINE}") def export_multiple_json(args, data: Dict) -> None: """Export data to multiple JSON files, one per chat.""" # Adjust output path if needed json_path = args.json[:-5] if args.json.endswith(".json") else args.json - + # Create directory if it doesn't exist if not os.path.isdir(json_path): os.makedirs(json_path, exist_ok=True) - + # Export each chat total = len(data.keys()) for index, jik in enumerate(data.keys()): @@ -602,22 +670,25 @@ def export_multiple_json(args, data: Dict) -> None: contact = data[jik]["name"].replace('/', '') else: contact = jik.replace('+', '') - - with open(f"{json_path}/{sanitize_filename(contact)}.json", "w") as f: + + if args.telegram: + messages = telegram_json_format(jik, data[jik], args.timezone_offset) + else: + messages = {jik: data[jik]} + with open(f"{json_path}/{safe_name(contact)}.json", "w") as f: file_content = json.dumps( - {jik: data[jik]}, - ensure_ascii=not args.avoid_encoding_json, + messages, + ensure_ascii=not args.avoid_encoding_json, indent=args.pretty_print_json ) f.write(file_content) - print(f"Writing JSON file...({index + 1}/{total})", end="\r") - print() + logger.info(f"Writing JSON file...({index + 1}/{total})\r") def process_exported_chat(args, data: ChatCollection) -> None: """Process an exported chat file.""" 
exported_handler.messages(args.exported, data, args.assume_first_as_me) - + if not args.no_html: android_handler.create_html( data, @@ -627,37 +698,61 @@ def process_exported_chat(args, data: ChatCollection) -> None: args.offline, args.size, args.no_avatar, - args.whatsapp_theme, + args.telegram_theme, args.headline ) - + # Copy files to output directory for file in glob.glob(r'*.*'): shutil.copy(file, args.output) +def setup_logging(level): + log_handler_stdout = logging.StreamHandler() + log_handler_stdout.terminator = "" + handlers = [log_handler_stdout] + if level == logging.DEBUG: + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + handlers.append(logging.FileHandler(f"wtsexpoter-debug-{timestamp}.log", mode="w")) + logging.basicConfig( + level=level, + format="[%(levelname)s] %(message)s", + handlers=handlers + ) + + def main(): """Main function to run the WhatsApp Chat Exporter.""" # Set up and parse arguments parser = setup_argument_parser() args = parser.parse_args() - + # Check for updates if args.check_update: exit(check_update()) - + # Validate arguments validate_args(parser, args) - + + # Print banner if not suppressed + if not args.no_banner: + print(WTSEXPORTER_BANNER) + + if args.debug: + setup_logging(logging.DEBUG) + logger.debug("Debug mode enabled.\n") + else: + setup_logging(logging.INFO) + # Create output directory if it doesn't exist os.makedirs(args.output, exist_ok=True) - + # Initialize data collection data = ChatCollection() - + # Set up contact store for vCard enrichment if needed contact_store = setup_contact_store(args) - + if args.import_json: # Import from JSON import_from_json(args.json, data) @@ -669,7 +764,7 @@ def main(): args.offline, args.size, args.no_avatar, - args.whatsapp_theme, + args.telegram_theme, args.headline ) elif args.exported: @@ -681,13 +776,13 @@ def main(): # Set default media path if not provided if args.media is None: args.media = "WhatsApp" - + # Set default DB paths if not provided if args.db is None: 
args.db = "msgstore.db" if args.wa is None: args.wa = "wa.db" - + # Decrypt backup if needed if args.key is not None: error = decrypt_android_backup(args) @@ -700,37 +795,54 @@ def main(): else: from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier as identifiers args.identifiers = identifiers - + # Set default media path if not provided if args.media is None: args.media = identifiers.DOMAIN - + # Extract media from backup if needed if args.backup is not None: if not os.path.isdir(args.media): - ios_media_handler.extract_media(args.backup, identifiers, args.decrypt_chunk_size) + ios_media_handler.extract_media( + args.backup, identifiers, args.decrypt_chunk_size) else: - print("WhatsApp directory already exists, skipping WhatsApp file extraction.") - + logger.info( + f"WhatsApp directory already exists, skipping WhatsApp file extraction.{CLEAR_LINE}") + # Set default DB paths if not provided if args.db is None: args.db = identifiers.MESSAGE if args.wa is None: args.wa = "ContactsV2.sqlite" - - # Process contacts - process_contacts(args, data, contact_store) - - # Process messages, media, and calls - process_messages(args, data) - - # Create output files - create_output_files(args, data, contact_store) - - # Handle media directory - handle_media_directory(args) - print("Everything is done!") + if args.incremental_merge: + incremental_merge( + args.source_dir, + args.target_dir, + args.media, + args.pretty_print_json, + args.avoid_encoding_json + ) + logger.info(f"Incremental merge completed successfully.{CLEAR_LINE}") + else: + # Process contacts + process_contacts(args, data) + + # Enrich contacts from vCards if needed + if args.android and contact_store and not contact_store.is_empty(): + contact_store.enrich_from_vcards(data) + + # Process messages, media, and calls + process_messages(args, data) + + # Create output files + create_output_files(args, data) + + # Handle media directory + handle_media_directory(args) + + logger.info("Everything is done!") + 
if __name__ == "__main__": main() diff --git a/Whatsapp_Chat_Exporter/android_crypt.py b/Whatsapp_Chat_Exporter/android_crypt.py index 84e629e..3e921d1 100644 --- a/Whatsapp_Chat_Exporter/android_crypt.py +++ b/Whatsapp_Chat_Exporter/android_crypt.py @@ -1,11 +1,14 @@ +import time import hmac import io +import logging +import threading import zlib import concurrent.futures from typing import Tuple, Union from hashlib import sha256 from sys import exit -from Whatsapp_Chat_Exporter.utility import CRYPT14_OFFSETS, Crypt, DbType +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CRYPT14_OFFSETS, Crypt, DbType try: import zlib @@ -23,6 +26,9 @@ else: support_crypt15 = True +logger = logging.getLogger(__name__) + + class DecryptionError(Exception): """Base class for decryption-related exceptions.""" pass @@ -115,6 +121,7 @@ def _decrypt_database(db_ciphertext: bytes, main_key: bytes, iv: bytes) -> bytes ) return db + def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> bytes: """Decrypt a crypt14 database using multithreading for brute-force offset detection. @@ -138,11 +145,28 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> iv = database[offsets["iv"]:offsets["iv"] + 16] db_ciphertext = database[offsets["db"]:] try: - return _decrypt_database(db_ciphertext, main_key, iv) + decrypted_db = _decrypt_database(db_ciphertext, main_key, iv) except (zlib.error, ValueError): pass # Try next offset + else: + logger.debug( + f"Decryption successful with known offsets: IV {offsets['iv']}, DB {offsets['db']}{CLEAR_LINE}" + ) + return decrypted_db # Successful decryption - print("Common offsets failed. Initiating brute-force with multithreading...") + def animate_message(stop_event): + base_msg = "Common offsets failed. 
Initiating brute-force with multithreading" + dots = ["", ".", "..", "..."] + i = 0 + while not stop_event.is_set(): + logger.info(f"{base_msg}{dots[i % len(dots)]}\x1b[K\r") + time.sleep(0.3) + i += 1 + logger.info(f"Common offsets failed but brute-forcing the offset works!{CLEAR_LINE}") + + stop_event = threading.Event() + anim_thread = threading.Thread(target=animate_message, args=(stop_event,)) + anim_thread.start() # Convert brute force generator into a list for parallel processing offset_combinations = list(brute_force_offset()) @@ -152,22 +176,27 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> start_iv, end_iv, start_db = offset_tuple iv = database[start_iv:end_iv] db_ciphertext = database[start_db:] + logger.debug(""f"Trying offsets: IV {start_iv}-{end_iv}, DB {start_db}{CLEAR_LINE}") try: db = _decrypt_database(db_ciphertext, main_key, iv) - print( + except (zlib.error, ValueError): + return None # Decryption failed, move to next + else: + stop_event.set() + anim_thread.join() + logger.info( f"The offsets of your IV and database are {start_iv} and " f"{start_db}, respectively. To include your offsets in the " "program, please report it by creating an issue on GitHub: " "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47" - "\nShutting down other threads..." 
+ f"\nShutting down other threads...{CLEAR_LINE}" ) return db - except (zlib.error, ValueError): - return None # Decryption failed, move to next with concurrent.futures.ThreadPoolExecutor(max_worker) as executor: - future_to_offset = {executor.submit(attempt_decrypt, offset): offset for offset in offset_combinations} + future_to_offset = {executor.submit(attempt_decrypt, offset) + : offset for offset in offset_combinations} try: for future in concurrent.futures.as_completed(future_to_offset): @@ -178,14 +207,18 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) -> return result except KeyboardInterrupt: - print("\nBrute force interrupted by user (Ctrl+C). Exiting gracefully...") + stop_event.set() + anim_thread.join() + logger.info(f"Brute force interrupted by user (Ctrl+C). Shutting down gracefully...{CLEAR_LINE}") executor.shutdown(wait=False, cancel_futures=True) exit(1) + finally: + stop_event.set() + anim_thread.join() raise OffsetNotFoundError("Could not find the correct offsets for decryption.") - def _decrypt_crypt12(database: bytes, main_key: bytes) -> bytes: """Decrypt a crypt12 database. 
@@ -287,7 +320,7 @@ def decrypt_backup( if crypt is not Crypt.CRYPT15 and len(key) != 158: raise InvalidKeyError("The key file must be 158 bytes") - #signature check, this is check is used in crypt 12 and 14 + # signature check, this is check is used in crypt 12 and 14 if crypt != Crypt.CRYPT15: t1 = key[30:62] @@ -297,7 +330,6 @@ def decrypt_backup( if t1 != database[3:35] and crypt == Crypt.CRYPT12: raise ValueError("The signature of key file and backup file mismatch") - if crypt == Crypt.CRYPT15: if keyfile_stream: main_key, hex_key = _extract_enc_key(key) @@ -305,7 +337,7 @@ def decrypt_backup( main_key, hex_key = _derive_main_enc_key(key) if show_crypt15: hex_key_str = ' '.join([hex_key.hex()[c:c+4] for c in range(0, len(hex_key.hex()), 4)]) - print(f"The HEX key of the crypt15 backup is: {hex_key_str}") + logger.info(f"The HEX key of the crypt15 backup is: {hex_key_str}{CLEAR_LINE}") else: main_key = key[126:] @@ -321,7 +353,6 @@ def decrypt_backup( except (InvalidFileFormatError, OffsetNotFoundError, ValueError) as e: raise DecryptionError(f"Decryption failed: {e}") from e - if not dry_run: with open(output, "wb") as f: f.write(db) diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index 5133d6c..274661b 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -1,5 +1,6 @@ #!/usr/bin/python3 +import logging import sqlite3 import os import shutil @@ -9,36 +10,41 @@ from markupsafe import escape as htmle from base64 import b64decode, b64encode from datetime import datetime from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty from 
Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata -from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable + + +logger = logging.getLogger(__name__) def contacts(db, data, enrich_from_vcards): """ Process WhatsApp contacts from the database. - + Args: db: Database connection data: Data store object enrich_from_vcards: Path to vCard file for contact enrichment - + Returns: bool: False if no contacts found, True otherwise """ c = db.cursor() c.execute("SELECT count() FROM wa_contacts") total_row_number = c.fetchone()[0] - + if total_row_number == 0: if enrich_from_vcards is not None: - print("No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") + logger.info( + "No contacts profiles found in the default database, contacts will be imported from the specified vCard file.") else: - print("No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") + logger.warning( + "No contacts profiles found in the default database, consider using --enrich-from-vcards for adopting names from exported contacts from Google") return False else: - print(f"Processing contacts...({total_row_number})") + logger.info(f"Processed {total_row_number} contacts\n") c.execute("SELECT jid, COALESCE(display_name, wa_name) as display_name, status FROM wa_contacts;") row = c.fetchone() @@ -47,14 +53,14 @@ def contacts(db, data, enrich_from_vcards): if row["status"] is not None: current_chat.status = row["status"] row = c.fetchone() - + return True -def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty): +def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty, no_reply): """ Process WhatsApp messages from the 
database. - + Args: db: Database connection data: Data store object @@ -66,7 +72,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c = db.cursor() total_row_number = _get_message_count(c, filter_empty, filter_date, filter_chat) - print(f"Processing messages...(0/{total_row_number})", end="\r") + logger.info(f"Processing messages...(0/{total_row_number})\r") try: content_cursor = _get_messages_cursor_legacy(c, filter_empty, filter_date, filter_chat) @@ -81,18 +87,18 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, i = 0 # Fetch the first row safely content = _fetch_row_safely(content_cursor) - + while content is not None: _process_single_message(data, content, table_message, timezone_offset) - + i += 1 if i % 1000 == 0: - print(f"Processing messages...({i}/{total_row_number})", end="\r") - + logger.info(f"Processing messages...({i}/{total_row_number})\r") + # Fetch the next row safely content = _fetch_row_safely(content_cursor) - - print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + + logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") # Helper functions for message processing @@ -102,8 +108,10 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat): try: empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", 
"messages.remote_resource"], "jid", "android") cursor.execute(f"""SELECT count() FROM messages @@ -119,8 +127,10 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat): except sqlite3.OperationalError: empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") cursor.execute(f"""SELECT count() FROM message @@ -142,8 +152,10 @@ def _get_messages_cursor_legacy(cursor, filter_empty, filter_date, filter_chat): """Get cursor for legacy database schema.""" empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "messages.remote_resource"], "jid_global", "android") cursor.execute(f"""SELECT messages.key_remote_jid, messages._id, @@ -205,8 +217,10 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): 
"""Get cursor for new database schema.""" empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") cursor.execute(f"""SELECT jid_global.raw_string as key_remote_jid, message._id, @@ -288,19 +302,18 @@ def _process_single_message(data, content, table_message, timezone_offset): """Process a single message row.""" if content["key_remote_jid"] is None: return - + # Get or create the chat - if not data.get_chat(content["key_remote_jid"]): - current_chat = data.add_chat(content["key_remote_jid"], ChatStore(Device.ANDROID, content["chat_subject"])) - else: - current_chat = data.get_chat(content["key_remote_jid"]) - + current_chat = data.get_chat(content["key_remote_jid"]) + if current_chat is None: + current_chat = data.add_chat(content["key_remote_jid"], ChatStore( + Device.ANDROID, content["chat_subject"])) # Determine sender_jid_row_id if "sender_jid_row_id" in content: sender_jid_row_id = content["sender_jid_row_id"] else: sender_jid_row_id = None - + # Create message object message = Message( from_me=not sender_jid_row_id and content["key_from_me"], @@ -312,19 +325,19 @@ def _process_single_message(data, content, table_message, timezone_offset): received_timestamp=content["received_timestamp"], read_timestamp=content["read_timestamp"] ) - + # Handle binary data if isinstance(content["data"], bytes): _process_binary_message(message, content) 
current_chat.add_message(content["_id"], message) return - + # Set sender for group chats if content["jid_type"] == JidType.GROUP and content["key_from_me"] == 0: _set_group_sender(message, content, data, table_message) else: message.sender = None - + # Handle quoted messages if content["quoted"] is not None: message.reply = content["quoted"] @@ -334,7 +347,7 @@ def _process_single_message(data, content, table_message, timezone_offset): message.quoted_data = content["quoted_data"] else: message.reply = None - + # Handle message caption if not table_message and content["media_caption"] is not None: # Old schema @@ -344,14 +357,14 @@ def _process_single_message(data, content, table_message, timezone_offset): message.caption = content["data"] else: message.caption = None - + # Handle message content based on status if content["status"] == 6: # 6 = Metadata _process_metadata_message(message, content, data, table_message) else: # Real message _process_regular_message(message, content, table_message) - + current_chat.add_message(content["_id"], message) @@ -381,7 +394,7 @@ def _set_group_sender(message, content, data, table_message): name = data.get_chat(content["remote_resource"]).name if "@" in content["remote_resource"]: fallback = content["remote_resource"].split('@')[0] - + message.sender = name or fallback @@ -389,7 +402,7 @@ def _process_metadata_message(message, content, data, table_message): """Process metadata message.""" message.meta = True name = fallback = None - + if table_message: if content["sender_jid_row_id"] > 0: _jid = content["group_sender_jid"] @@ -408,12 +421,12 @@ def _process_metadata_message(message, content, data, table_message): fallback = _jid.split('@')[0] else: name = "You" - + message.data = determine_metadata(content, name or fallback) - + if isinstance(message.data, str) and "
" in message.data: message.safe = True - + if message.data is None: if content["video_call"] is not None: # Missed call message.meta = True @@ -429,7 +442,7 @@ def _process_metadata_message(message, content, data, table_message): def _process_regular_message(message, content, table_message): """Process regular (non-metadata) message.""" message.sticker = content["media_wa_type"] == 20 # Sticker is a message - + if content["key_from_me"] == 1: if content["status"] == 5 and content["edit_version"] == 7 or table_message and content["media_wa_type"] == 15: msg = "Message deleted" @@ -454,7 +467,7 @@ def _process_regular_message(message, content, table_message): msg = content["data"] if msg is not None: msg = _format_message_text(msg) - + message.data = msg @@ -470,7 +483,7 @@ def _format_message_text(text): def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True): """ Process WhatsApp media files from the database. - + Args: db: Database connection data: Data store object @@ -482,30 +495,30 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa """ c = db.cursor() total_row_number = _get_media_count(c, filter_empty, filter_date, filter_chat) - print(f"\nProcessing media...(0/{total_row_number})", end="\r") - + logger.info(f"Processing media...(0/{total_row_number})\r") + try: content_cursor = _get_media_cursor_legacy(c, filter_empty, filter_date, filter_chat) except sqlite3.OperationalError: content_cursor = _get_media_cursor_new(c, filter_empty, filter_date, filter_chat) - + content = content_cursor.fetchone() mime = MimeTypes() - + # Ensure thumbnails directory exists Path(f"{media_folder}/thumbnails").mkdir(parents=True, exist_ok=True) - + i = 0 while content is not None: _process_single_media(data, content, media_folder, mime, separate_media) - + i += 1 if i % 100 == 0: - print(f"Processing media...({i}/{total_row_number})", end="\r") - + logger.info(f"Processing media...({i}/{total_row_number})\r") 
+ content = content_cursor.fetchone() - - print(f"Processing media...({total_row_number}/{total_row_number})", end="\r") + + logger.info(f"Processed {total_row_number} media{CLEAR_LINE}") # Helper functions for media processing @@ -515,8 +528,10 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat): try: empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") cursor.execute(f"""SELECT count() FROM message_media @@ -534,8 +549,10 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat): except sqlite3.OperationalError: empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") cursor.execute(f"""SELECT count() FROM message_media @@ -557,10 +574,12 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat): 
def _get_media_cursor_legacy(cursor, filter_empty, filter_date, filter_chat): """Get cursor for legacy media database schema.""" - empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") + empty_filter = get_cond_for_empty(filter_empty, "messages.key_remote_jid", "messages.needs_push") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") cursor.execute(f"""SELECT messages.key_remote_jid, message_row_id, @@ -592,8 +611,10 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat): """Get cursor for new media database schema.""" empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' - include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") - exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + include_filter = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") cursor.execute(f"""SELECT jid.raw_string as key_remote_jid, message_row_id, @@ -629,10 +650,10 @@ def _process_single_media(data, content, media_folder, mime, separate_media): current_chat = 
data.get_chat(content["key_remote_jid"]) message = current_chat.get_message(content["message_row_id"]) message.media = True - + if os.path.isfile(file_path): message.data = file_path - + # Set mime type if content["mime_type"] is None: guess = mime.guess_type(file_path)[0] @@ -642,11 +663,11 @@ def _process_single_media(data, content, media_folder, mime, separate_media): message.mime = "application/octet-stream" else: message.mime = content["mime_type"] - + # Copy media to separate folder if needed if separate_media: - chat_display_name = slugify(current_chat.name or message.sender - or content["key_remote_jid"].split('@')[0], True) + chat_display_name = safe_name(current_chat.name or message.sender + or content["key_remote_jid"].split('@')[0]) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) @@ -657,7 +678,7 @@ def _process_single_media(data, content, media_folder, mime, separate_media): message.data = "The media is missing" message.mime = "media" message.meta = True - + # Handle thumbnail if content["thumbnail"] is not None: thumb_path = f"{media_folder}/thumbnails/{b64decode(content['file_hash']).hex()}.png" @@ -676,23 +697,26 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): rows = _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty) total_row_number = len(rows) - print(f"\nProcessing vCards...(0/{total_row_number})", end="\r") - + logger.info(f"Processing vCards...(0/{total_row_number})\r") + # Create vCards directory if it doesn't exist path = os.path.join(media_folder, "vCards") Path(path).mkdir(parents=True, exist_ok=True) - + for index, row in enumerate(rows): _process_vcard_row(row, path, data) - print(f"Processing vCards...({index + 1}/{total_row_number})", end="\r") + logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r") + logger.info(f"Processed {total_row_number} 
vCards{CLEAR_LINE}") def _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty): """Execute vCard query for modern WhatsApp database schema.""" # Build the filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") @@ -721,8 +745,10 @@ def _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty): """Execute vCard query for legacy WhatsApp database schema.""" # Build the filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") @@ -755,11 +781,11 @@ def _process_vcard_row(row, path, data): file_name = "".join(x for x in media_name if x.isalnum()) file_name = file_name.encode('utf-8')[:230].decode('utf-8', 'ignore') file_path = os.path.join(path, 
f"{file_name}.vcf") - + if not os.path.isfile(file_path): with open(file_path, "w", encoding="utf-8") as f: f.write(row["vcard"]) - + message = data.get_chat(row["key_remote_jid"]).get_message(row["message_row_id"]) message.data = "This media include the following vCard file(s):
" \ f'{htmle(media_name)}' @@ -771,28 +797,29 @@ def _process_vcard_row(row, path, data): def calls(db, data, timezone_offset, filter_chat): """Process call logs from WhatsApp database.""" c = db.cursor() - + # Check if there are any calls that match the filter total_row_number = _get_calls_count(c, filter_chat) if total_row_number == 0: return - - print(f"\nProcessing calls...({total_row_number})", end="\r") - + + logger.info(f"Processing calls...({total_row_number})\r") + # Fetch call data calls_data = _fetch_calls_data(c, filter_chat) - + # Create a chat store for all calls chat = ChatStore(Device.ANDROID, "WhatsApp Calls") - + # Process each call content = calls_data.fetchone() while content is not None: _process_call_record(content, chat, data, timezone_offset) content = calls_data.fetchone() - + # Add the calls chat to the data data.add_chat("000000000000000", chat) + logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}") def _get_calls_count(c, filter_chat): @@ -855,7 +882,7 @@ def _process_call_record(content, chat, data, timezone_offset): received_timestamp=None, # TODO: Add timestamp read_timestamp=None # TODO: Add timestamp ) - + # Get caller/callee name _jid = content["raw_string"] name = data.get_chat(_jid).name if _jid in data else content["chat_subject"] or None @@ -864,13 +891,13 @@ def _process_call_record(content, chat, data, timezone_offset): else: fallback = None call.sender = name or fallback - + # Set metadata call.meta = True - + # Construct call description based on call type and result call.data = _construct_call_description(content, call) - + # Add call to chat chat.add_message(content["_id"], call) @@ -882,7 +909,7 @@ def _construct_call_description(content, call): f"call {'to' if call.from_me else 'from'} " f"{call.sender} was " ) - + if content['call_result'] in (0, 4, 7): description += "cancelled." if call.from_me else "missed." 
elif content['call_result'] == 2: @@ -898,26 +925,26 @@ def _construct_call_description(content, call): ) else: description += "in an unknown state." - + return description def create_html( - data, - output_folder, - template=None, - embedded=False, - offline_static=False, - maximum_size=None, - no_avatar=False, - experimental=False, - headline=None - ): + data, + output_folder, + template=None, + embedded=False, + offline_static=False, + maximum_size=None, + no_avatar=False, + experimental=False, + headline=None +): """Generate HTML chat files from data.""" template = setup_template(template, no_avatar, experimental) total_row_number = len(data) - print(f"\nGenerating chats...(0/{total_row_number})", end="\r") + logger.info(f"Generating chats...(0/{total_row_number})\r") # Create output directory if it doesn't exist if not os.path.isdir(output_folder): @@ -930,37 +957,37 @@ def create_html( if len(current_chat) == 0: # Skip empty chats continue - + safe_file_name, name = get_file_name(contact, current_chat) if maximum_size is not None: _generate_paginated_chat( - current_chat, - safe_file_name, - name, - contact, - output_folder, - template, - w3css, - maximum_size, + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, + maximum_size, headline ) else: _generate_single_chat( - current_chat, - safe_file_name, - name, - contact, - output_folder, - template, - w3css, + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, headline ) - - if current % 10 == 0: - print(f"Generating chats...({current}/{total_row_number})", end="\r") - print(f"Generating chats...({total_row_number}/{total_row_number})", end="\r") + if current % 10 == 0: + logger.info(f"Generating chats...({current}/{total_row_number})\r") + + logger.info(f"Generated {total_row_number} chats{CLEAR_LINE}") def _generate_single_chat(current_chat, safe_file_name, name, contact, output_folder, template, w3css, headline): @@ -984,20 +1011,20 
@@ def _generate_paginated_chat(current_chat, safe_file_name, name, contact, output current_size = 0 current_page = 1 render_box = [] - + # Use default maximum size if set to 0 if maximum_size == 0: maximum_size = MAX_SIZE - + last_msg = current_chat.get_last_message().key_id - + for message in current_chat.values(): # Calculate message size if message.data is not None and not message.meta and not message.media: current_size += len(message.data) + ROW_SIZE else: current_size += ROW_SIZE + 100 # Assume media and meta HTML are 100 bytes - + if current_size > maximum_size: # Create a new page output_file_name = f"{output_folder}/{safe_file_name}-{current_page}.html" @@ -1041,25 +1068,25 @@ def _generate_paginated_chat(current_chat, safe_file_name, name, contact, output def create_txt(data, output): """Generate text files from chat data.""" os.makedirs(output, exist_ok=True) - + for jik, chat in data.items(): if len(chat) == 0: continue - + # Determine file name if chat.name is not None: contact = chat.name.replace('/', '') else: contact = jik.replace('+', '') - + output_file = os.path.join(output, f"{contact}.txt") - + with open(output_file, "w", encoding="utf8") as f: for message in chat.values(): # Skip metadata in text format if message.meta and message.mime != "media": continue - + # Format the message formatted_message = _format_message_for_txt(message, contact) f.write(f"{formatted_message}\n") @@ -1068,16 +1095,16 @@ def create_txt(data, output): def _format_message_for_txt(message, contact): """Format a message for text output.""" date = datetime.fromtimestamp(message.timestamp).date() - + # Determine the sender name if message.from_me: name = "You" else: name = message.sender if message.sender else contact - + prefix = f"[{date} {message.time}] {name}: " prefix_length = len(prefix) - + # Handle different message types if message.media and ("/" in message.mime or message.mime == "media"): if message.data == "The media is missing": @@ -1089,9 +1116,9 @@ def 
_format_message_for_txt(message, contact): message_text = "" else: message_text = message.data.replace('
', f'\n{" " * prefix_length}') - + # Add caption if present if message.caption is not None: message_text += "\n" + ' ' * len(prefix) + message.caption.replace('
', f'\n{" " * prefix_length}') - + return f"{prefix}{message_text}" diff --git a/Whatsapp_Chat_Exporter/bplist.py b/Whatsapp_Chat_Exporter/bplist.py index 390fe6e..126dfc9 100644 --- a/Whatsapp_Chat_Exporter/bplist.py +++ b/Whatsapp_Chat_Exporter/bplist.py @@ -24,51 +24,19 @@ import struct import codecs from datetime import datetime, timedelta -class BPListWriter(object): - def __init__(self, objects): - self.bplist = "" - self.objects = objects - - def binary(self): - '''binary -> string - - Generates bplist - ''' - self.data = 'bplist00' - - # TODO: flatten objects and count max length size - - # TODO: write objects and save offsets - - # TODO: write offsets - - # TODO: write metadata - - return self.data - - def write(self, filename): - ''' - - Writes bplist to file - ''' - if self.bplist != "": - pass - # TODO: save self.bplist to file - else: - raise Exception('BPlist not yet generated') class BPListReader(object): def __init__(self, s): self.data = s self.objects = [] self.resolved = {} - + def __unpackIntStruct(self, sz, s): '''__unpackIntStruct(size, string) -> int - + Unpacks the integer of given size (1, 2 or 4 bytes) from string ''' - if sz == 1: + if sz == 1: ot = '!B' elif sz == 2: ot = '!H' @@ -79,17 +47,17 @@ class BPListReader(object): else: raise Exception('int unpack size '+str(sz)+' unsupported') return struct.unpack(ot, s)[0] - + def __unpackInt(self, offset): '''__unpackInt(offset) -> int - + Unpacks int field from plist at given offset ''' return self.__unpackIntMeta(offset)[1] def __unpackIntMeta(self, offset): '''__unpackIntMeta(offset) -> (size, int) - + Unpacks int field from plist at given offset and returns its size and value ''' obj_header = self.data[offset] @@ -99,7 +67,7 @@ class BPListReader(object): def __resolveIntSize(self, obj_info, offset): '''__resolveIntSize(obj_info, offset) -> (count, offset) - + Calculates count of objref* array entries and returns count and offset to first element ''' if obj_info == 0x0F: @@ -112,10 
+80,10 @@ class BPListReader(object): def __unpackFloatStruct(self, sz, s): '''__unpackFloatStruct(size, string) -> float - + Unpacks the float of given size (4 or 8 bytes) from string ''' - if sz == 4: + if sz == 4: ot = '!f' elif sz == 8: ot = '!d' @@ -125,7 +93,7 @@ class BPListReader(object): def __unpackFloat(self, offset): '''__unpackFloat(offset) -> float - + Unpacks float field from plist at given offset ''' obj_header = self.data[offset] @@ -135,70 +103,79 @@ class BPListReader(object): def __unpackDate(self, offset): td = int(struct.unpack(">d", self.data[offset+1:offset+9])[0]) - return datetime(year=2001,month=1,day=1) + timedelta(seconds=td) + return datetime(year=2001, month=1, day=1) + timedelta(seconds=td) def __unpackItem(self, offset): '''__unpackItem(offset) - + Unpacks and returns an item from plist ''' obj_header = self.data[offset] obj_type, obj_info = (obj_header & 0xF0), (obj_header & 0x0F) - if obj_type == 0x00: - if obj_info == 0x00: # null 0000 0000 + if obj_type == 0x00: + if obj_info == 0x00: # null 0000 0000 return None - elif obj_info == 0x08: # bool 0000 1000 // false + elif obj_info == 0x08: # bool 0000 1000 // false return False - elif obj_info == 0x09: # bool 0000 1001 // true + elif obj_info == 0x09: # bool 0000 1001 // true return True - elif obj_info == 0x0F: # fill 0000 1111 // fill byte - raise Exception("0x0F Not Implemented") # this is really pad byte, FIXME + elif obj_info == 0x0F: # fill 0000 1111 // fill byte + raise Exception("0x0F Not Implemented") # this is really pad byte, FIXME else: - raise Exception('unpack item type '+str(obj_header)+' at '+str(offset)+ 'failed') - elif obj_type == 0x10: # int 0001 nnnn ... // # of bytes is 2^nnnn, big-endian bytes + raise Exception('unpack item type '+str(obj_header)+' at '+str(offset) + 'failed') + elif obj_type == 0x10: # int 0001 nnnn ... // # of bytes is 2^nnnn, big-endian bytes return self.__unpackInt(offset) - elif obj_type == 0x20: # real 0010 nnnn ... 
// # of bytes is 2^nnnn, big-endian bytes + elif obj_type == 0x20: # real 0010 nnnn ... // # of bytes is 2^nnnn, big-endian bytes return self.__unpackFloat(offset) - elif obj_type == 0x30: # date 0011 0011 ... // 8 byte float follows, big-endian bytes + elif obj_type == 0x30: # date 0011 0011 ... // 8 byte float follows, big-endian bytes return self.__unpackDate(offset) - elif obj_type == 0x40: # data 0100 nnnn [int] ... // nnnn is number of bytes unless 1111 then int count follows, followed by bytes + # data 0100 nnnn [int] ... // nnnn is number of bytes unless 1111 then int count follows, followed by bytes + elif obj_type == 0x40: obj_count, objref = self.__resolveIntSize(obj_info, offset) - return self.data[objref:objref+obj_count] # XXX: we return data as str - elif obj_type == 0x50: # string 0101 nnnn [int] ... // ASCII string, nnnn is # of chars, else 1111 then int count, then bytes + return self.data[objref:objref+obj_count] # XXX: we return data as str + # string 0101 nnnn [int] ... // ASCII string, nnnn is # of chars, else 1111 then int count, then bytes + elif obj_type == 0x50: obj_count, objref = self.__resolveIntSize(obj_info, offset) return self.data[objref:objref+obj_count] - elif obj_type == 0x60: # string 0110 nnnn [int] ... // Unicode string, nnnn is # of chars, else 1111 then int count, then big-endian 2-byte uint16_t + # string 0110 nnnn [int] ... // Unicode string, nnnn is # of chars, else 1111 then int count, then big-endian 2-byte uint16_t + elif obj_type == 0x60: obj_count, objref = self.__resolveIntSize(obj_info, offset) return self.data[objref:objref+obj_count*2].decode('utf-16be') - elif obj_type == 0x80: # uid 1000 nnnn ... // nnnn+1 is # of bytes + elif obj_type == 0x80: # uid 1000 nnnn ... 
// nnnn+1 is # of bytes # FIXME: Accept as a string for now obj_count, objref = self.__resolveIntSize(obj_info, offset) return self.data[objref:objref+obj_count] - elif obj_type == 0xA0: # array 1010 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + # array 1010 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + elif obj_type == 0xA0: obj_count, objref = self.__resolveIntSize(obj_info, offset) arr = [] for i in range(obj_count): - arr.append(self.__unpackIntStruct(self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) + arr.append(self.__unpackIntStruct( + self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) return arr - elif obj_type == 0xC0: # set 1100 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + # set 1100 nnnn [int] objref* // nnnn is count, unless '1111', then int count follows + elif obj_type == 0xC0: # XXX: not serializable via apple implementation - raise Exception("0xC0 Not Implemented") # FIXME: implement - elif obj_type == 0xD0: # dict 1101 nnnn [int] keyref* objref* // nnnn is count, unless '1111', then int count follows + raise Exception("0xC0 Not Implemented") # FIXME: implement + # dict 1101 nnnn [int] keyref* objref* // nnnn is count, unless '1111', then int count follows + elif obj_type == 0xD0: obj_count, objref = self.__resolveIntSize(obj_info, offset) keys = [] for i in range(obj_count): - keys.append(self.__unpackIntStruct(self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) + keys.append(self.__unpackIntStruct( + self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) values = [] objref += obj_count*self.object_ref_size for i in range(obj_count): - values.append(self.__unpackIntStruct(self.object_ref_size, 
self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) + values.append(self.__unpackIntStruct( + self.object_ref_size, self.data[objref+i*self.object_ref_size:objref+i*self.object_ref_size+self.object_ref_size])) dic = {} for i in range(obj_count): dic[keys[i]] = values[i] return dic else: raise Exception('don\'t know how to unpack obj type '+hex(obj_type)+' at '+str(offset)) - + def __resolveObject(self, idx): try: return self.resolved[idx] @@ -212,7 +189,7 @@ class BPListReader(object): return newArr if type(obj) == dict: newDic = {} - for k,v in obj.items(): + for k, v in obj.items(): key_resolved = self.__resolveObject(k) if isinstance(key_resolved, str): rk = key_resolved @@ -225,15 +202,16 @@ class BPListReader(object): else: self.resolved[idx] = obj return obj - + def parse(self): # read header if self.data[:8] != b'bplist00': raise Exception('Bad magic') - + # read trailer - self.offset_size, self.object_ref_size, self.number_of_objects, self.top_object, self.table_offset = struct.unpack('!6xBB4xI4xI4xI', self.data[-32:]) - #print "** plist offset_size:",self.offset_size,"objref_size:",self.object_ref_size,"num_objs:",self.number_of_objects,"top:",self.top_object,"table_ofs:",self.table_offset + self.offset_size, self.object_ref_size, self.number_of_objects, self.top_object, self.table_offset = struct.unpack( + '!6xBB4xI4xI4xI', self.data[-32:]) + # print "** plist offset_size:",self.offset_size,"objref_size:",self.object_ref_size,"num_objs:",self.number_of_objects,"top:",self.top_object,"table_ofs:",self.table_offset # read offset table self.offset_table = self.data[self.table_offset:-32] @@ -243,50 +221,25 @@ class BPListReader(object): offset_entry = ot[:self.offset_size] ot = ot[self.offset_size:] self.offsets.append(self.__unpackIntStruct(self.offset_size, offset_entry)) - #print "** plist offsets:",self.offsets - + # print "** plist offsets:",self.offsets + # read object table self.objects = [] k = 0 for i in 
self.offsets: obj = self.__unpackItem(i) - #print "** plist unpacked",k,type(obj),obj,"at",i + # print "** plist unpacked",k,type(obj),obj,"at",i k += 1 self.objects.append(obj) - + # rebuild object tree - #for i in range(len(self.objects)): + # for i in range(len(self.objects)): # self.__resolveObject(i) - + # return root object return self.__resolveObject(self.top_object) - + @classmethod def plistWithString(cls, s): parser = cls(s) return parser.parse() - -# helpers for testing -def plist(obj): - from Foundation import NSPropertyListSerialization, NSPropertyListBinaryFormat_v1_0 - b = NSPropertyListSerialization.dataWithPropertyList_format_options_error_(obj, NSPropertyListBinaryFormat_v1_0, 0, None) - return str(b.bytes()) - -def unplist(s): - from Foundation import NSData, NSPropertyListSerialization - d = NSData.dataWithBytes_length_(s, len(s)) - return NSPropertyListSerialization.propertyListWithData_options_format_error_(d, 0, None, None) - -if __name__ == "__main__": - import os - import sys - import json - file_path = sys.argv[1] - - with open(file_path, "rb") as fp: - data = fp.read() - - out = BPListReader(data).parse() - - with open(file_path + ".json", "w") as fp: - json.dump(out, indent=4) diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index e84154d..8747419 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -7,6 +7,7 @@ class Timing: """ Handles timestamp formatting with timezone support. """ + def __init__(self, timezone_offset: Optional[int]) -> None: """ Initialize Timing object. 
@@ -27,7 +28,7 @@ class Timing: Returns: Optional[str]: Formatted timestamp string, or None if timestamp is None """ - if timestamp: + if timestamp is not None: timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp return datetime.fromtimestamp(timestamp, TimeZone(self.timezone_offset)).strftime(format) return None @@ -37,6 +38,7 @@ class TimeZone(tzinfo): """ Custom timezone class with fixed offset. """ + def __init__(self, offset: int) -> None: """ Initialize TimeZone object. @@ -151,6 +153,7 @@ class ChatStore: """ Stores chat information and messages. """ + def __init__(self, type: str, name: Optional[str] = None, media: Optional[str] = None) -> None: """ Initialize ChatStore object. @@ -159,7 +162,7 @@ class ChatStore: type (str): Device type (IOS or ANDROID) name (Optional[str]): Chat name media (Optional[str]): Path to media folder - + Raises: TypeError: If name is not a string or None """ @@ -182,7 +185,7 @@ class ChatStore: self.their_avatar_thumb = None self.status = None self.media_base = "" - + def __len__(self) -> int: """Get number of chats. 
Required for dict-like access.""" return len(self._messages) @@ -192,7 +195,7 @@ class ChatStore: if not isinstance(message, Message): raise TypeError("message must be a Message object") self._messages[id] = message - + def get_message(self, id: str) -> 'Message': """Get a message from the chat store.""" return self._messages.get(id) @@ -204,20 +207,30 @@ class ChatStore: def to_json(self) -> Dict[str, Any]: """Convert chat store to JSON-serializable dict.""" - return { - 'name': self.name, - 'type': self.type, - 'my_avatar': self.my_avatar, - 'their_avatar': self.their_avatar, - 'their_avatar_thumb': self.their_avatar_thumb, - 'status': self.status, - 'messages': {id: msg.to_json() for id, msg in self._messages.items()} + json_dict = { + key: value + for key, value in self.__dict__.items() + if key != '_messages' } + json_dict['messages'] = {id: msg.to_json() for id, msg in self._messages.items()} + return json_dict + + @classmethod + def from_json(cls, data: Dict) -> 'ChatStore': + """Create a chat store from JSON data.""" + chat = cls(data.get("type"), data.get("name")) + for key, value in data.items(): + if hasattr(chat, key) and key not in ("messages", "type", "name"): + setattr(chat, key, value) + for id, msg_data in data.get("messages", {}).items(): + message = Message.from_json(msg_data) + chat.add_message(id, message) + return chat def get_last_message(self) -> 'Message': """Get the most recent message in the chat.""" return tuple(self._messages.values())[-1] - + def items(self): """Get message items pairs.""" return self._messages.items() @@ -230,20 +243,42 @@ class ChatStore: """Get all message keys in the chat.""" return self._messages.keys() + def merge_with(self, other: 'ChatStore'): + """Merge another ChatStore into this one. 
+ + Args: + other (ChatStore): The ChatStore to merge with + + """ + if not isinstance(other, ChatStore): + raise TypeError("Can only merge with another ChatStore object") + + # Update fields if they are not None in the other ChatStore + self.name = other.name or self.name + self.type = other.type or self.type + self.my_avatar = other.my_avatar or self.my_avatar + self.their_avatar = other.their_avatar or self.their_avatar + self.their_avatar_thumb = other.their_avatar_thumb or self.their_avatar_thumb + self.status = other.status or self.status + + # Merge messages + self._messages.update(other._messages) + class Message: """ Represents a single message in a chat. """ + def __init__( self, *, from_me: Union[bool, int], timestamp: int, time: Union[int, float, str], - key_id: int, - received_timestamp: int, - read_timestamp: int, + key_id: Union[int, str], + received_timestamp: int = None, + read_timestamp: int = None, timezone_offset: int = 0, message_type: Optional[int] = None ) -> None: @@ -255,8 +290,8 @@ class Message: timestamp (int): Message timestamp time (Union[int, float, str]): Message time key_id (int): Message unique identifier - received_timestamp (int): When message was received - read_timestamp (int): When message was read + received_timestamp (int, optional): When message was received. Defaults to None + read_timestamp (int, optional): When message was read. Defaults to None timezone_offset (int, optional): Hours offset from UTC. Defaults to 0 message_type (Optional[int], optional): Type of message. 
Defaults to None @@ -266,7 +301,7 @@ class Message: self.from_me = bool(from_me) self.timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp timing = Timing(timezone_offset) - + if isinstance(time, (int, float)): self.time = timing.format_timestamp(self.timestamp, "%H:%M") elif isinstance(time, str): @@ -281,10 +316,22 @@ class Message: self.sender = None self.safe = False self.mime = None - self.message_type = message_type, - self.received_timestamp = timing.format_timestamp(received_timestamp, "%Y/%m/%d %H:%M") - self.read_timestamp = timing.format_timestamp(read_timestamp, "%Y/%m/%d %H:%M") - + self.message_type = message_type + if isinstance(received_timestamp, (int, float)): + self.received_timestamp = timing.format_timestamp( + received_timestamp, "%Y/%m/%d %H:%M") + elif isinstance(received_timestamp, str): + self.received_timestamp = received_timestamp + else: + self.received_timestamp = None + if isinstance(read_timestamp, (int, float)): + self.read_timestamp = timing.format_timestamp( + read_timestamp, "%Y/%m/%d %H:%M") + elif isinstance(read_timestamp, str): + self.read_timestamp = read_timestamp + else: + self.read_timestamp = None + # Extra attributes self.reply = None self.quoted_data = None @@ -295,19 +342,24 @@ class Message: def to_json(self) -> Dict[str, Any]: """Convert message to JSON-serializable dict.""" return { - 'from_me': self.from_me, - 'timestamp': self.timestamp, - 'time': self.time, - 'media': self.media, - 'key_id': self.key_id, - 'meta': self.meta, - 'data': self.data, - 'sender': self.sender, - 'safe': self.safe, - 'mime': self.mime, - 'reply': self.reply, - 'quoted_data': self.quoted_data, - 'caption': self.caption, - 'thumb': self.thumb, - 'sticker': self.sticker - } \ No newline at end of file + key: value + for key, value in self.__dict__.items() + } + + @classmethod + def from_json(cls, data: Dict) -> 'Message': + message = cls( + from_me=data["from_me"], + timestamp=data["timestamp"], + time=data["time"], + 
key_id=data["key_id"], + message_type=data.get("message_type"), + received_timestamp=data.get("received_timestamp"), + read_timestamp=data.get("read_timestamp") + ) + added = ("from_me", "timestamp", "time", "key_id", "message_type", + "received_timestamp", "read_timestamp") + for key, value in data.items(): + if hasattr(message, key) and key not in added: + setattr(message, key, value) + return message diff --git a/Whatsapp_Chat_Exporter/exported_handler.py b/Whatsapp_Chat_Exporter/exported_handler.py index 7215f6f..9e53c23 100644 --- a/Whatsapp_Chat_Exporter/exported_handler.py +++ b/Whatsapp_Chat_Exporter/exported_handler.py @@ -1,21 +1,25 @@ #!/usr/bin/python3 import os +import logging from datetime import datetime from mimetypes import MimeTypes from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import Device +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device + + +logger = logging.getLogger(__name__) def messages(path, data, assume_first_as_me=False): """ Extracts messages from an exported WhatsApp chat file. 
- + Args: path: Path to the exported chat file data: Data container object to store the parsed chat assume_first_as_me: If True, assumes the first message is sent from the user without asking - + Returns: Updated data container with extracted messages """ @@ -23,55 +27,55 @@ def messages(path, data, assume_first_as_me=False): chat = data.add_chat("ExportedChat", ChatStore(Device.EXPORTED)) you = "" # Will store the username of the current user user_identification_done = False # Flag to track if user identification has been done - + # First pass: count total lines for progress reporting with open(path, "r", encoding="utf8") as file: total_row_number = sum(1 for _ in file) - + # Second pass: process the messages with open(path, "r", encoding="utf8") as file: for index, line in enumerate(file): you, user_identification_done = process_line( - line, index, chat, path, you, + line, index, chat, path, you, assume_first_as_me, user_identification_done ) # Show progress if index % 1000 == 0: - print(f"Processing messages & media...({index}/{total_row_number})", end="\r") + logger.info(f"Processing messages & media...({index}/{total_row_number})\r") - print(f"Processing messages & media...({total_row_number}/{total_row_number})") + logger.info(f"Processed {total_row_number} messages & media{CLEAR_LINE}") return data def process_line(line, index, chat, file_path, you, assume_first_as_me, user_identification_done): """ Process a single line from the chat file - + Returns: Tuple of (updated_you_value, updated_user_identification_done_flag) """ parts = line.split(" - ", 1) - + # Check if this is a new message (has timestamp format) if len(parts) > 1: time = parts[0] you, user_identification_done = process_new_message( - time, parts[1], index, chat, you, file_path, + time, parts[1], index, chat, you, file_path, assume_first_as_me, user_identification_done ) else: # This is a continuation of the previous message process_message_continuation(line, index, chat) - + return you, 
user_identification_done -def process_new_message(time, content, index, chat, you, file_path, +def process_new_message(time, content, index, chat, you, file_path, assume_first_as_me, user_identification_done): """ Process a line that contains a new message - + Returns: Tuple of (updated_you_value, updated_user_identification_done_flag) """ @@ -84,7 +88,7 @@ def process_new_message(time, content, index, chat, you, file_path, received_timestamp=None, read_timestamp=None ) - + # Check if this is a system message (no name:message format) if ":" not in content: msg.data = content @@ -92,7 +96,7 @@ def process_new_message(time, content, index, chat, you, file_path, else: # Process user message name, message = content.strip().split(":", 1) - + # Handle user identification if you == "": if chat.name is None: @@ -109,17 +113,17 @@ def process_new_message(time, content, index, chat, you, file_path, # If we know the chat name, anyone else must be "you" if name != chat.name: you = name - + # Set the chat name if needed if chat.name is None and name != you: chat.name = name - + # Determine if this message is from the current user msg.from_me = (name == you) - + # Process message content process_message_content(msg, message, file_path) - + chat.add_message(index, msg) return you, user_identification_done @@ -140,11 +144,11 @@ def process_attached_file(msg, message, file_path): """Process an attached file in a message""" mime = MimeTypes() msg.media = True - + # Extract file path and check if it exists file_name = message.split("(file attached)")[0].strip() attached_file_path = os.path.join(os.path.dirname(file_path), file_name) - + if os.path.isfile(attached_file_path): msg.data = attached_file_path guess = mime.guess_type(attached_file_path)[0] @@ -161,9 +165,9 @@ def process_message_continuation(line, index, chat): lookback = index - 1 while lookback not in chat.keys(): lookback -= 1 - + msg = chat.get_message(lookback) - + # Add the continuation line to the message if 
msg.media: msg.caption = line.strip() @@ -178,4 +182,4 @@ def prompt_for_user_identification(name): if ans == "y": return name elif ans == "n": - return "" \ No newline at end of file + return "" diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 7a15835..0501ac5 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -1,14 +1,18 @@ #!/usr/bin/python3 import os +import logging import shutil from glob import glob from pathlib import Path from mimetypes import MimeTypes from markupsafe import escape as htmle from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CURRENT_TZ_OFFSET, get_chat_condition -from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, slugify, Device +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition +from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name, Device + + +logger = logging.getLogger(__name__) def contacts(db, data): @@ -16,26 +20,27 @@ def contacts(db, data): c = db.cursor() c.execute("""SELECT count() FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") total_row_number = c.fetchone()[0] - print(f"Pre-processing contacts...({total_row_number})") - + logger.info(f"Pre-processing contacts...({total_row_number})\r") + c.execute("""SELECT ZWHATSAPPID, ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") content = c.fetchone() while content is not None: zwhatsapp_id = content["ZWHATSAPPID"] if not zwhatsapp_id.endswith("@s.whatsapp.net"): zwhatsapp_id += "@s.whatsapp.net" - + current_chat = ChatStore(Device.IOS) current_chat.status = content["ZABOUTTEXT"] data.add_chat(zwhatsapp_id, current_chat) content = c.fetchone() + logger.info(f"Pre-processed {total_row_number} contacts{CLEAR_LINE}") def process_contact_avatars(current_chat, media_folder, 
contact_id): """Process and assign avatar images for a contact.""" path = f'{media_folder}/Media/Profile/{contact_id.split("@")[0]}' avatars = glob(f"{path}*") - + if 0 < len(avatars) <= 1: current_chat.their_avatar = avatars[0] else: @@ -55,16 +60,18 @@ def get_contact_name(content): return content["ZPUSHNAME"] -def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty): +def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, filter_empty, no_reply): """Process WhatsApp messages and contacts from the database.""" c = db.cursor() cursor2 = db.cursor() - + # Build the chat filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") date_filter = f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else '' - + # Process contacts first contact_query = f""" SELECT count() @@ -85,7 +92,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c.execute(contact_query) total_row_number = c.fetchone()[0] - print(f"Processing contacts...({total_row_number})") + logger.info(f"Processing contacts...({total_row_number})\r") # Get distinct contacts contacts_query = f""" @@ -105,13 +112,13 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, GROUP BY ZCONTACTJID; """ c.execute(contacts_query) - + # Process each contact content = c.fetchone() while content is not None: contact_name = get_contact_name(content) contact_id = content["ZCONTACTJID"] - + # Add or update chat 
if contact_id not in data: current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder)) @@ -119,11 +126,13 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, current_chat = data.get_chat(contact_id) current_chat.name = contact_name current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg") - + # Process avatar images process_contact_avatars(current_chat, media_folder, contact_id) content = c.fetchone() + logger.info(f"Processed {total_row_number} contacts{CLEAR_LINE}") + # Get message count message_count_query = f""" SELECT count() @@ -139,8 +148,8 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c.execute(message_count_query) total_row_number = c.fetchone()[0] - print(f"Processing messages...(0/{total_row_number})", end="\r") - + logger.info(f"Processing messages...(0/{total_row_number})\r") + # Fetch messages messages_query = f""" SELECT ZCONTACTJID, @@ -168,7 +177,7 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, ORDER BY ZMESSAGEDATE ASC; """ c.execute(messages_query) - + # Process each message i = 0 content = c.fetchone() @@ -176,14 +185,14 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, contact_id = content["ZCONTACTJID"] message_pk = content["Z_PK"] is_group_message = content["ZGROUPINFO"] is not None - + # Ensure chat exists if contact_id not in data: current_chat = data.add_chat(contact_id, ChatStore(Device.IOS)) process_contact_avatars(current_chat, media_folder, contact_id) else: current_chat = data.get_chat(contact_id) - + # Create message object ts = APPLE_TIME + content["ZMESSAGEDATE"] message = Message( @@ -196,24 +205,23 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None, read_timestamp=None # TODO: Add timestamp ) - + # Process message data - 
invalid = process_message_data(message, content, is_group_message, data, cursor2) - + invalid = process_message_data(message, content, is_group_message, data, cursor2, no_reply) + # Add valid messages to chat if not invalid: current_chat.add_message(message_pk, message) - + # Update progress i += 1 if i % 1000 == 0: - print(f"Processing messages...({i}/{total_row_number})", end="\r") + logger.info(f"Processing messages...({i}/{total_row_number})\r") content = c.fetchone() - - print(f"Processing messages...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") -def process_message_data(message, content, is_group_message, data, cursor2): +def process_message_data(message, content, is_group_message, data, cursor2, no_reply): """Process and set message data from content row.""" # Handle group sender info if is_group_message and content["ZISFROMME"] == 0: @@ -230,13 +238,13 @@ def process_message_data(message, content, is_group_message, data, cursor2): message.sender = name or fallback else: message.sender = None - + # Handle metadata messages if content["ZMESSAGETYPE"] == 6: return process_metadata_message(message, content, is_group_message) - + # Handle quoted replies - if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14") and False: + if content["ZMETADATA"] is not None and content["ZMETADATA"].startswith(b"\x2a\x14") and not no_reply: quoted = content["ZMETADATA"][2:19] message.reply = quoted.decode() cursor2.execute(f"""SELECT ZTEXT @@ -244,17 +252,17 @@ def process_message_data(message, content, is_group_message, data, cursor2): WHERE ZSTANZAID LIKE '{message.reply}%'""") quoted_content = cursor2.fetchone() if quoted_content and "ZTEXT" in quoted_content: - message.quoted_data = quoted_content["ZTEXT"] + message.quoted_data = quoted_content["ZTEXT"] else: message.quoted_data = None - + # Handle stickers if content["ZMESSAGETYPE"] == 15: message.sticker = True # Process 
message text process_message_text(message, content) - + return False # Message is valid @@ -299,19 +307,21 @@ def process_message_text(message, content): msg = content["ZTEXT"] if msg is not None: msg = msg.replace("\r\n", "
").replace("\n", "
") - + message.data = msg def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False): """Process media files from WhatsApp messages.""" c = db.cursor() - + # Build filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID","ZMEMBERJID"], "ZGROUPINFO", "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZWACHATSESSION.ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") date_filter = f'AND ZMESSAGEDATE {filter_date}' if filter_date is not None else '' - + # Get media count media_count_query = f""" SELECT count() @@ -329,8 +339,8 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa """ c.execute(media_count_query) total_row_number = c.fetchone()[0] - print(f"\nProcessing media...(0/{total_row_number})", end="\r") - + logger.info(f"Processing media...(0/{total_row_number})\r") + # Fetch media items media_query = f""" SELECT ZCONTACTJID, @@ -354,21 +364,20 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa ORDER BY ZCONTACTJID ASC """ c.execute(media_query) - + # Process each media item mime = MimeTypes() i = 0 content = c.fetchone() while content is not None: process_media_item(content, data, media_folder, mime, separate_media) - + # Update progress i += 1 if i % 100 == 0: - print(f"Processing media...({i}/{total_row_number})", end="\r") + logger.info(f"Processing media...({i}/{total_row_number})\r") content = c.fetchone() - - print(f"Processing media...({total_row_number}/{total_row_number})", end="\r") + logger.info(f"Processed {total_row_number} media{CLEAR_LINE}") def process_media_item(content, data, media_folder, mime, 
separate_media): @@ -377,23 +386,24 @@ def process_media_item(content, data, media_folder, mime, separate_media): current_chat = data.get_chat(content["ZCONTACTJID"]) message = current_chat.get_message(content["ZMESSAGE"]) message.media = True - + if current_chat.media_base == "": current_chat.media_base = media_folder + "/" - + if os.path.isfile(file_path): message.data = '/'.join(file_path.split("/")[1:]) - + # Set MIME type if content["ZVCARDSTRING"] is None: guess = mime.guess_type(file_path)[0] message.mime = guess if guess is not None else "application/octet-stream" else: message.mime = content["ZVCARDSTRING"] - + # Handle separate media option if separate_media: - chat_display_name = slugify(current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0], True) + chat_display_name = safe_name( + current_chat.name or message.sender or content["ZCONTACTJID"].split('@')[0]) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) @@ -405,7 +415,7 @@ def process_media_item(content, data, media_folder, mime, separate_media): message.data = "The media is missing" message.mime = "media" message.meta = True - + # Add caption if available if content["ZTITLE"] is not None: message.caption = content["ZTITLE"] @@ -414,12 +424,14 @@ def process_media_item(content, data, media_folder, mime, separate_media): def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): """Process vCard contacts from WhatsApp messages.""" c = db.cursor() - + # Build filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") + chat_filter_exclude = 
get_chat_condition( + filter_chat[1], False, ["ZCONTACTJID", "ZMEMBERJID"], "ZGROUPINFO", "ios") date_filter = f'AND ZWAMESSAGE.ZMESSAGEDATE {filter_date}' if filter_date is not None else '' - + # Fetch vCard mentions vcard_query = f""" SELECT DISTINCT ZWAVCARDMENTION.ZMEDIAITEM, @@ -444,8 +456,8 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): c.execute(vcard_query) contents = c.fetchall() total_row_number = len(contents) - print(f"\nProcessing vCards...(0/{total_row_number})", end="\r") - + logger.info(f"Processing vCards...(0/{total_row_number})\r") + # Create vCards directory path = f'{media_folder}/Message/vCards' Path(path).mkdir(parents=True, exist_ok=True) @@ -453,7 +465,8 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): # Process each vCard for index, content in enumerate(contents): process_vcard_item(content, path, data) - print(f"Processing vCards...({index + 1}/{total_row_number})", end="\r") + logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r") + logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}") def process_vcard_item(content, path, data): @@ -478,9 +491,10 @@ def process_vcard_item(content, path, data): f.write(vcard_string) # Create vCard summary and update message - vcard_summary = "This media include the following vCard file(s):
" - vcard_summary += " | ".join([f'{htmle(name)}' for name, fp in zip(vcard_names, file_paths)]) - + vcard_summary = "This media include the following vCard file(s):
" + vcard_summary += " | ".join([f'{htmle(name)}' for name, + fp in zip(vcard_names, file_paths)]) + message = data.get_chat(content["ZCONTACTJID"]).get_message(content["ZMESSAGE"]) message.data = vcard_summary message.mime = "text/x-vcard" @@ -492,11 +506,13 @@ def process_vcard_item(content, path, data): def calls(db, data, timezone_offset, filter_chat): """Process WhatsApp call records.""" c = db.cursor() - + # Build filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") - + chat_filter_include = get_chat_condition( + filter_chat[0], True, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") + chat_filter_exclude = get_chat_condition( + filter_chat[1], False, ["ZGROUPCALLCREATORUSERJIDSTRING"], None, "ios") + # Get call count call_count_query = f""" SELECT count() @@ -509,9 +525,9 @@ def calls(db, data, timezone_offset, filter_chat): total_row_number = c.fetchone()[0] if total_row_number == 0: return - - print(f"\nProcessing calls...({total_row_number})", end="\r") - + + logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}\n") + # Fetch call records calls_query = f""" SELECT ZCALLIDSTRING, @@ -532,16 +548,16 @@ def calls(db, data, timezone_offset, filter_chat): {chat_filter_exclude} """ c.execute(calls_query) - + # Create calls chat chat = ChatStore(Device.ANDROID, "WhatsApp Calls") - + # Process each call content = c.fetchone() while content is not None: process_call_record(content, chat, data, timezone_offset) content = c.fetchone() - + # Add calls chat to data data.add_chat("000000000000000", chat) @@ -556,7 +572,7 @@ def process_call_record(content, chat, data, timezone_offset): key_id=content["ZCALLIDSTRING"], timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET ) - + # Set sender info _jid = content["ZGROUPCALLCREATORUSERJIDSTRING"] name = 
data.get_chat(_jid).name if _jid in data else None @@ -565,11 +581,11 @@ def process_call_record(content, chat, data, timezone_offset): else: fallback = None call.sender = name or fallback - + # Set call metadata call.meta = True call.data = format_call_data(call, content) - + # Add call to chat chat.add_message(call.key_id, call) @@ -583,7 +599,7 @@ def format_call_data(call, content): f"call {'to' if call.from_me else 'from'} " f"{call.sender} was " ) - + # Call outcome if content['ZOUTCOME'] in (1, 4): call_data += "not answered." if call.from_me else "missed." @@ -598,5 +614,5 @@ def format_call_data(call, content): ) else: call_data += "in an unknown state." - - return call_data \ No newline at end of file + + return call_data diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index a1dcd30..4416727 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 +import logging import shutil import sqlite3 import os import getpass -from sys import exit -from Whatsapp_Chat_Exporter.utility import WhatsAppIdentifier +from sys import exit, platform as osname +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier from Whatsapp_Chat_Exporter.bplist import BPListReader try: from iphone_backup_decrypt import EncryptedBackup, RelativePath @@ -15,6 +16,9 @@ else: support_encrypted = True +logger = logging.getLogger(__name__) + + class BackupExtractor: """ A class to handle the extraction of WhatsApp data from iOS backups, @@ -42,27 +46,38 @@ class BackupExtractor: Returns: bool: True if encrypted, False otherwise. 
""" - with sqlite3.connect(os.path.join(self.base_dir, "Manifest.db")) as db: - c = db.cursor() - try: - c.execute("SELECT count() FROM Files") - c.fetchone() # Execute and fetch to trigger potential errors - except (sqlite3.OperationalError, sqlite3.DatabaseError): - return True + try: + with sqlite3.connect(os.path.join(self.base_dir, "Manifest.db")) as db: + c = db.cursor() + try: + c.execute("SELECT count() FROM Files") + c.fetchone() # Execute and fetch to trigger potential errors + except (sqlite3.OperationalError, sqlite3.DatabaseError): + return True + else: + return False + except sqlite3.DatabaseError as e: + if str(e) == "authorization denied" and osname == "darwin": + logger.error( + "You don't have permission to access the backup database. Please" + "check your permissions or try moving the backup to somewhere else." + ) + exit(8) else: - return False + raise e def _extract_encrypted_backup(self): """ Handles the extraction of data from an encrypted iOS backup. """ if not support_encrypted: - print("You don't have the dependencies to handle encrypted backup.") - print("Read more on how to deal with encrypted backup:") - print("https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage") + logger.error("You don't have the dependencies to handle encrypted backup." + "Read more on how to deal with encrypted backup:" + "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/blob/main/README.md#usage" + ) return - print("Encryption detected on the backup!") + logger.info(f"Encryption detected on the backup!{CLEAR_LINE}") password = getpass.getpass("Enter the password for the backup:") self._decrypt_backup(password) self._extract_decrypted_files() @@ -74,7 +89,7 @@ class BackupExtractor: Args: password (str): The password for the encrypted backup. 
""" - print("Trying to decrypt the iOS backup...", end="") + logger.info(f"Trying to decrypt the iOS backup...{CLEAR_LINE}") self.backup = EncryptedBackup( backup_directory=self.base_dir, passphrase=password, @@ -82,7 +97,8 @@ class BackupExtractor: check_same_thread=False, decrypt_chunk_size=self.decrypt_chunk_size, ) - print("Done\nDecrypting WhatsApp database...", end="") + logger.info(f"iOS backup decrypted successfully{CLEAR_LINE}") + logger.info("Decrypting WhatsApp database...\r") try: self.backup.extract_file( relative_path=RelativePath.WHATSAPP_MESSAGES, @@ -100,23 +116,23 @@ class BackupExtractor: output_filename=self.identifiers.CALL, ) except ValueError: - print("Failed to decrypt backup: incorrect password?") + logger.error("Failed to decrypt backup: incorrect password?") exit(7) except FileNotFoundError: - print( + logger.error( "Essential WhatsApp files are missing from the iOS backup. " "Perhapse you enabled end-to-end encryption for the backup? " "See https://wts.knugi.dev/docs.html?dest=iose2e" ) exit(6) else: - print("Done") - + logger.info(f"WhatsApp database decrypted successfully{CLEAR_LINE}") + def _extract_decrypted_files(self): """Extract all WhatsApp files after decryption""" def extract_progress_handler(file_id, domain, relative_path, n, total_files): if n % 100 == 0: - print(f"Decrypting and extracting files...({n}/{total_files})", end="\r") + logger.info(f"Decrypting and extracting files...({n}/{total_files})\r") return True self.backup.extract_files( @@ -125,7 +141,7 @@ class BackupExtractor: preserve_folders=True, filter_callback=extract_progress_handler ) - print(f"All required files are decrypted and extracted. 
", end="\n") + logger.info(f"All required files are decrypted and extracted.{CLEAR_LINE}") def _extract_unencrypted_backup(self): """ @@ -144,10 +160,10 @@ class BackupExtractor: if not os.path.isfile(wts_db_path): if self.identifiers is WhatsAppIdentifier: - print("WhatsApp database not found.") + logger.error("WhatsApp database not found.") else: - print("WhatsApp Business database not found.") - print( + logger.error("WhatsApp Business database not found.") + logger.error( "Essential WhatsApp files are missing from the iOS backup. " "Perhapse you enabled end-to-end encryption for the backup? " "See https://wts.knugi.dev/docs.html?dest=iose2e" @@ -157,12 +173,12 @@ class BackupExtractor: shutil.copyfile(wts_db_path, self.identifiers.MESSAGE) if not os.path.isfile(contact_db_path): - print("Contact database not found. Skipping...") + logger.warning(f"Contact database not found. Skipping...{CLEAR_LINE}") else: shutil.copyfile(contact_db_path, self.identifiers.CONTACT) if not os.path.isfile(call_db_path): - print("Call database not found. Skipping...") + logger.warning(f"Call database not found. 
Skipping...{CLEAR_LINE}") else: shutil.copyfile(call_db_path, self.identifiers.CALL) @@ -176,7 +192,7 @@ class BackupExtractor: c = manifest.cursor() c.execute(f"SELECT count() FROM Files WHERE domain = '{_wts_id}'") total_row_number = c.fetchone()[0] - print(f"Extracting WhatsApp files...(0/{total_row_number})", end="\r") + logger.info(f"Extracting WhatsApp files...(0/{total_row_number})\r") c.execute( f""" SELECT fileID, relativePath, flags, file AS metadata, @@ -213,9 +229,9 @@ class BackupExtractor: os.utime(destination, (modification, modification)) if row["_index"] % 100 == 0: - print(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})", end="\r") + logger.info(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})\r") row = c.fetchone() - print(f"Extracting WhatsApp files...({total_row_number}/{total_row_number})", end="\n") + logger.info(f"Extracted WhatsApp files...({total_row_number}){CLEAR_LINE}") def extract_media(base_dir, identifiers, decrypt_chunk_size): @@ -229,4 +245,3 @@ def extract_media(base_dir, identifiers, decrypt_chunk_size): """ extractor = BackupExtractor(base_dir, identifiers, decrypt_chunk_size) extractor.extract() - diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index 16e927f..a147dfb 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -1,22 +1,26 @@ +import logging import sqlite3 import jinja2 import json import os import unicodedata import re +import string import math +import shutil from bleach import clean as sanitize from markupsafe import Markup from datetime import datetime, timedelta from enum import IntEnum -from Whatsapp_Chat_Exporter.data_model import ChatStore -from typing import Dict, List, Optional, Tuple +from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore, Timing +from typing import Dict, List, Optional, Tuple, Union try: from enum import StrEnum, IntEnum except ImportError: # < Python 3.11 - # This 
should be removed when the support for Python 3.10 ends. + # This should be removed when the support for Python 3.10 ends. (31 Oct 2026) from enum import Enum + class StrEnum(str, Enum): pass @@ -26,6 +30,9 @@ except ImportError: MAX_SIZE = 4 * 1024 * 1024 # Default 4MB ROW_SIZE = 0x3D0 CURRENT_TZ_OFFSET = datetime.now().astimezone().utcoffset().seconds / 3600 +CLEAR_LINE = "\x1b[K\n" + +logger = logging.getLogger(__name__) def convert_time_unit(time_second: int) -> str: @@ -37,23 +44,31 @@ def convert_time_unit(time_second: int) -> str: Returns: str: A human-readable string representing the time duration. """ - time = str(timedelta(seconds=time_second)) - if "day" not in time: - if time_second < 1: - time = "less than a second" - elif time_second == 1: - time = "a second" - elif time_second < 60: - time = time[5:][1 if time_second < 10 else 0:] + " seconds" - elif time_second == 60: - time = "a minute" - elif time_second < 3600: - time = time[2:] + " minutes" - elif time_second == 3600: - time = "an hour" - else: - time += " hour" - return time + if time_second < 1: + return "less than a second" + elif time_second == 1: + return "a second" + + delta = timedelta(seconds=time_second) + parts = [] + + days = delta.days + if days > 0: + parts.append(f"{days} day{'s' if days > 1 else ''}") + + hours = delta.seconds // 3600 + if hours > 0: + parts.append(f"{hours} hour{'s' if hours > 1 else ''}") + + minutes = (delta.seconds % 3600) // 60 + if minutes > 0: + parts.append(f"{minutes} minute{'s' if minutes > 1 else ''}") + + seconds = delta.seconds % 60 + if seconds > 0: + parts.append(f"{seconds} second{'s' if seconds > 1 else ''}") + + return " ".join(parts) def bytes_to_readable(size_bytes: int) -> str: @@ -70,8 +85,8 @@ def bytes_to_readable(size_bytes: int) -> str: Returns: A human-readable string representing the file size. 
""" - if size_bytes == 0: - return "0B" + if size_bytes < 1024: + return f"{size_bytes} B" size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") i = int(math.floor(math.log(size_bytes, 1024))) p = math.pow(1024, i) @@ -99,14 +114,19 @@ def readable_to_bytes(size_str: str) -> int: 'TB': 1024**4, 'PB': 1024**5, 'EB': 1024**6, - 'ZB': 1024**7, + 'ZB': 1024**7, 'YB': 1024**8 } size_str = size_str.upper().strip() - number, unit = size_str[:-2].strip(), size_str[-2:].strip() - if unit not in SIZE_UNITS or not number.isnumeric(): - raise ValueError("Invalid input for size_str. Example: 1024GB") - return int(number) * SIZE_UNITS[unit] + if size_str.isnumeric(): + # If the string is purely numeric, assume it's in bytes + return int(size_str) + match = re.fullmatch(r'^(\d+(\.\d*)?)\s*([KMGTPEZY]?B)?$', size_str) + if not match: + raise ValueError("Invalid size format for size_str. Expected format like '10MB', '1024GB', or '512'.") + unit = ''.join(filter(str.isalpha, size_str)).strip() + number = ''.join(c for c in size_str if c.isdigit() or c == '.').strip() + return int(float(number) * SIZE_UNITS[unit]) def sanitize_except(html: str) -> Markup: @@ -149,41 +169,44 @@ def check_update(): try: raw = urllib.request.urlopen(PACKAGE_JSON) except Exception: - print("Failed to check for updates.") + logger.error("Failed to check for updates.") return 1 else: with raw: package_info = json.load(raw) - latest_version = tuple(map(int, package_info["info"]["version"].split("."))) + latest_version = tuple( + map(int, package_info["info"]["version"].split("."))) __version__ = importlib.metadata.version("whatsapp_chat_exporter") current_version = tuple(map(int, __version__.split("."))) if current_version < latest_version: - print("===============Update===============") - print("A newer version of WhatsApp Chat Exporter is available.") - print("Current version: " + __version__) - print("Latest version: " + package_info["info"]["version"]) + logger.info( + 
"===============Update===============\n" + "A newer version of WhatsApp Chat Exporter is available.\n" + f"Current version: {__version__}\n" + f"Latest version: {package_info['info']['version']}\n" + ) if platform == "win32": - print("Update with: pip install --upgrade whatsapp-chat-exporter") + logger.info("Update with: pip install --upgrade whatsapp-chat-exporter\n") else: - print("Update with: pip3 install --upgrade whatsapp-chat-exporter") - print("====================================") + logger.info("Update with: pip3 install --upgrade whatsapp-chat-exporter\n") + logger.info("====================================\n") else: - print("You are using the latest version of WhatsApp Chat Exporter.") + logger.info("You are using the latest version of WhatsApp Chat Exporter.\n") return 0 def rendering( - output_file_name, - template, - name, - msgs, - contact, - w3css, - chat, - headline, - next=False, - previous=False - ): + output_file_name, + template, + name, + msgs, + contact, + w3css, + chat, + headline, + next=False, + previous=False +): if chat.their_avatar_thumb is None and chat.their_avatar is not None: their_avatar_thumb = chat.their_avatar else: @@ -215,59 +238,104 @@ class Device(StrEnum): EXPORTED = "exported" -def import_from_json(json_file: str, data: Dict[str, ChatStore]): +def import_from_json(json_file: str, data: ChatCollection): """Imports chat data from a JSON file into the data dictionary. Args: json_file: The path to the JSON file. data: The dictionary to store the imported chat data. 
""" - from Whatsapp_Chat_Exporter.data_model import ChatStore, Message with open(json_file, "r") as f: temp_data = json.loads(f.read()) total_row_number = len(tuple(temp_data.keys())) - print(f"Importing chats from JSON...(0/{total_row_number})", end="\r") + logger.info(f"Importing chats from JSON...(0/{total_row_number})\r") for index, (jid, chat_data) in enumerate(temp_data.items()): - chat = ChatStore(chat_data.get("type"), chat_data.get("name")) - chat.my_avatar = chat_data.get("my_avatar") - chat.their_avatar = chat_data.get("their_avatar") - chat.their_avatar_thumb = chat_data.get("their_avatar_thumb") - chat.status = chat_data.get("status") - for id, msg in chat_data.get("messages").items(): - message = Message( - from_me=msg["from_me"], - timestamp=msg["timestamp"], - time=msg["time"], - key_id=msg["key_id"], - received_timestamp=msg.get("received_timestamp"), - read_timestamp=msg.get("read_timestamp") - ) - message.media = msg.get("media") - message.meta = msg.get("meta") - message.data = msg.get("data") - message.sender = msg.get("sender") - message.safe = msg.get("safe") - message.mime = msg.get("mime") - message.reply = msg.get("reply") - message.quoted_data = msg.get("quoted_data") - message.caption = msg.get("caption") - message.thumb = msg.get("thumb") - message.sticker = msg.get("sticker") - chat.add_message(id, message) - data[jid] = chat - print(f"Importing chats from JSON...({index + 1}/{total_row_number})", end="\r") + chat = ChatStore.from_json(chat_data) + data.add_chat(jid, chat) + logger.info( + f"Importing chats from JSON...({index + 1}/{total_row_number})\r") + logger.info(f"Imported {total_row_number} chats from JSON{CLEAR_LINE}") -def sanitize_filename(file_name: str) -> str: - """Sanitizes a filename by removing invalid and unsafe characters. 
+def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_print_json: int, avoid_encoding_json: bool): + """Merges JSON files from the source directory into the target directory. Args: - file_name: The filename to sanitize. - - Returns: - The sanitized filename. + source_dir (str): The path to the source directory containing JSON files. + target_dir (str): The path to the target directory to merge into. + media_dir (str): The path to the media directory. """ - return "".join(x for x in file_name if x.isalnum() or x in "- ") + json_files = [f for f in os.listdir(source_dir) if f.endswith('.json')] + if not json_files: + logger.error("No JSON files found in the source directory.") + return + + logger.info("JSON files found:", json_files) + + for json_file in json_files: + source_path = os.path.join(source_dir, json_file) + target_path = os.path.join(target_dir, json_file) + + if not os.path.exists(target_path): + logger.info(f"Copying '{json_file}' to target directory...") + os.makedirs(target_dir, exist_ok=True) + shutil.copy2(source_path, target_path) + else: + logger.info( + f"Merging '{json_file}' with existing file in target directory...") + with open(source_path, 'r') as src_file, open(target_path, 'r') as tgt_file: + source_data = json.load(src_file) + target_data = json.load(tgt_file) + + # Parse JSON into ChatStore objects using from_json() + source_chats = {jid: ChatStore.from_json( + chat) for jid, chat in source_data.items()} + target_chats = {jid: ChatStore.from_json( + chat) for jid, chat in target_data.items()} + + # Merge chats using merge_with() + for jid, chat in source_chats.items(): + if jid in target_chats: + target_chats[jid].merge_with(chat) + else: + target_chats[jid] = chat + + # Serialize merged data + merged_data = {jid: chat.to_json() + for jid, chat in target_chats.items()} + + # Check if the merged data differs from the original target data + if json.dumps(merged_data, sort_keys=True) != json.dumps(target_data, 
sort_keys=True): + logger.info( + f"Changes detected in '{json_file}', updating target file...") + with open(target_path, 'w') as merged_file: + json.dump( + merged_data, + merged_file, + indent=pretty_print_json, + ensure_ascii=not avoid_encoding_json, + ) + else: + logger.info( + f"No changes detected in '{json_file}', skipping update.") + + # Merge media directories + source_media_path = os.path.join(source_dir, media_dir) + target_media_path = os.path.join(target_dir, media_dir) + logger.info( + f"Merging media directories. Source: {source_media_path}, target: {target_media_path}") + if os.path.exists(source_media_path): + for root, _, files in os.walk(source_media_path): + relative_path = os.path.relpath(root, source_media_path) + target_root = os.path.join(target_media_path, relative_path) + os.makedirs(target_root, exist_ok=True) + for file in files: + source_file = os.path.join(root, file) + target_file = os.path.join(target_root, file) + # we only copy if the file doesn't exist in the target or if the source is newer + if not os.path.exists(target_file) or os.path.getmtime(source_file) > os.path.getmtime(target_file): + logger.info(f"Copying '{source_file}' to '{target_file}'...") + shutil.copy2(source_file, target_file) def get_file_name(contact: str, chat: ChatStore) -> Tuple[str, str]: @@ -299,7 +367,7 @@ def get_file_name(contact: str, chat: ChatStore) -> Tuple[str, str]: else: name = phone_number - return sanitize_filename(file_name), name + return safe_name(file_name), name def get_cond_for_empty(enable: bool, jid_field: str, broadcast_field: str) -> str: @@ -335,23 +403,29 @@ def get_chat_condition(filter: Optional[List[str]], include: bool, columns: List if filter is not None: conditions = [] if len(columns) < 2 and jid is not None: - raise ValueError("There must be at least two elements in argument columns if jid is not None") + raise ValueError( + "There must be at least two elements in argument columns if jid is not None") if jid is not None: if 
platform == "android": is_group = f"{jid}.type == 1" elif platform == "ios": is_group = f"{jid} IS NOT NULL" else: - raise ValueError("Only android and ios are supported for argument platform if jid is not None") + raise ValueError( + "Only android and ios are supported for argument platform if jid is not None") for index, chat in enumerate(filter): if include: - conditions.append(f"{' OR' if index > 0 else ''} {columns[0]} LIKE '%{chat}%'") + conditions.append( + f"{' OR' if index > 0 else ''} {columns[0]} LIKE '%{chat}%'") if len(columns) > 1: - conditions.append(f" OR ({columns[1]} LIKE '%{chat}%' AND {is_group})") + conditions.append( + f" OR ({columns[1]} LIKE '%{chat}%' AND {is_group})") else: - conditions.append(f"{' AND' if index > 0 else ''} {columns[0]} NOT LIKE '%{chat}%'") + conditions.append( + f"{' AND' if index > 0 else ''} {columns[0]} NOT LIKE '%{chat}%'") if len(columns) > 1: - conditions.append(f" AND ({columns[1]} NOT LIKE '%{chat}%' AND {is_group})") + conditions.append( + f" AND ({columns[1]} NOT LIKE '%{chat}%' AND {is_group})") return f"AND ({' '.join(conditions)})" else: return "" @@ -447,7 +521,7 @@ def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optiona else: msg = f"{old} changed their number to {new}" elif content["action_type"] == 46: - return # Voice message in PM??? Seems no need to handle. + return # Voice message in PM??? Seems no need to handle. elif content["action_type"] == 47: msg = "The contact is an official business account" elif content["action_type"] == 50: @@ -464,7 +538,8 @@ def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optiona elif content["action_type"] == 67: return # (PM) this contact use secure service from Facebook??? elif content["action_type"] == 69: - return # (PM) this contact use secure service from Facebook??? What's the difference with 67???? + # (PM) this contact use secure service from Facebook??? What's the difference with 67???? 
+ return else: return # Unsupported return msg @@ -491,8 +566,10 @@ def get_status_location(output_folder: str, offline_static: str) -> str: w3css_path = os.path.join(static_folder, "w3.css") if not os.path.isfile(w3css_path): with urllib.request.urlopen(w3css) as resp: - with open(w3css_path, "wb") as f: f.write(resp.read()) + with open(w3css_path, "wb") as f: + f.write(resp.read()) w3css = os.path.join(offline_static, "w3.css") + return w3css def setup_template(template: Optional[str], no_avatar: bool, experimental: bool = False) -> jinja2.Template: @@ -522,43 +599,130 @@ def setup_template(template: Optional[str], no_avatar: bool, experimental: bool template_env.filters['sanitize_except'] = sanitize_except return template_env.get_template(template_file) + # iOS Specific APPLE_TIME = 978307200 -def slugify(value: str, allow_unicode: bool = False) -> str: +def safe_name(text: Union[str, bytes]) -> str: """ - Convert text to ASCII-only slugs for URL-safe strings. - Taken from https://github.com/django/django/blob/master/django/utils/text.py + Sanitize the input text and generates a safe file name. + This function serves a similar purpose to slugify() from + Django previously used in this project, but is a clean-room + Reimplementation tailored for performance and a narrower + Use case for this project. Licensed under the same terms + As the project (MIT). Args: - value (str): The string to convert to a slug. - allow_unicode (bool, optional): Whether to allow Unicode characters. Defaults to False. + text (str|bytes): The string to be sanitized. Returns: - str: The slugified string with only alphanumerics, underscores, or hyphens. + str: The sanitized string with only alphanumerics, underscores, or hyphens. 
""" - value = str(value) - if allow_unicode: - value = unicodedata.normalize('NFKC', value) - else: - value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') - value = re.sub(r'[^\w\s-]', '', value.lower()) - return re.sub(r'[-\s]+', '-', value).strip('-_') + if isinstance(text, bytes): + text = text.decode("utf-8", "ignore") + elif not isinstance(text, str): + raise TypeError("value must be a string or bytes") + normalized_text = unicodedata.normalize("NFKC", text) + safe_chars = [char for char in normalized_text if char.isalnum() or char in "-_ ."] + return "-".join(''.join(safe_chars).split()) + + +def get_from_string(msg: Dict, chat_id: str) -> str: + """Return the number or name for the sender""" + if msg["from_me"]: + return "Me" + if msg["sender"]: + return str(msg["sender"]) + return str(chat_id) + + +def get_chat_type(chat_id: str) -> str: + """Return the chat type based on the whatsapp id""" + if chat_id.endswith("@s.whatsapp.net"): + return "personal_chat" + if chat_id.endswith("@g.us"): + return "private_group" + logger.warning("Unknown chat type for %s, defaulting to private_group", chat_id) + return "private_group" + + +def get_from_id(msg: Dict, chat_id: str) -> str: + """Return the user id for the sender""" + if msg["from_me"]: + return "user00000" + if msg["sender"]: + return "user" + msg["sender"] + return f"user{chat_id}" + + +def get_reply_id(data: Dict, reply_key: int) -> Optional[int]: + """Get the id of the message corresponding to the reply""" + if not reply_key: + return None + for msg_id, msg in data["messages"].items(): + if msg["key_id"] == reply_key: + return msg_id + return None + + +def telegram_json_format(jik: str, data: Dict, timezone_offset) -> Dict: + """Convert the data to the Telegram export format""" + timing = Timing(timezone_offset or CURRENT_TZ_OFFSET) + try: + chat_id = int(''.join([c for c in jik if c.isdigit()])) + except ValueError: + # not a real chat: e.g. 
statusbroadcast + chat_id = 0 + obj = { + "name": data["name"] if data["name"] else jik, + "type": get_chat_type(jik), + "id": chat_id, + "messages": [ { + "id": int(msgId), + "type": "message", + "date": timing.format_timestamp(msg["timestamp"], "%Y-%m-%dT%H:%M:%S"), + "date_unixtime": int(msg["timestamp"]), + "from": get_from_string(msg, chat_id), + "from_id": get_from_id(msg, chat_id), + "reply_to_message_id": get_reply_id(data, msg["reply"]), + "text": msg["data"], + "text_entities": [ + { + # TODO this will lose formatting and different types + "type": "plain", + "text": msg["data"], + } + ], + } for msgId, msg in data["messages"].items()] + } + # remove empty messages and replies + for msg_id, msg in enumerate(obj["messages"]): + if not msg["reply_to_message_id"]: + del obj["messages"][msg_id]["reply_to_message_id"] + obj["messages"] = [m for m in obj["messages"] if m["text"]] + return obj class WhatsAppIdentifier(StrEnum): - MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ChatStorage.sqlite - CONTACT = "b8548dc30aa1030df0ce18ef08b882cf7ab5212f" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ContactsV2.sqlite - CALL = "1b432994e958845fffe8e2f190f26d1511534088" # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-CallHistory.sqlite + # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ChatStorage.sqlite + MESSAGE = "7c7fba66680ef796b916b067077cc246adacf01d" + # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-ContactsV2.sqlite + CONTACT = "b8548dc30aa1030df0ce18ef08b882cf7ab5212f" + # AppDomainGroup-group.net.whatsapp.WhatsApp.shared-CallHistory.sqlite + CALL = "1b432994e958845fffe8e2f190f26d1511534088" DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsApp.shared" class WhatsAppBusinessIdentifier(StrEnum): - MESSAGE = "724bd3b98b18518b455a87c1f3ac3a0d189c4466" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ChatStorage.sqlite - CONTACT = "d7246a707f51ddf8b17ee2dddabd9e0a4da5c552" # 
AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ContactsV2.sqlite - CALL = "b463f7c4365eefc5a8723930d97928d4e907c603" # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-CallHistory.sqlite - DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared" + # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ChatStorage.sqlite + MESSAGE = "724bd3b98b18518b455a87c1f3ac3a0d189c4466" + # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-ContactsV2.sqlite + CONTACT = "d7246a707f51ddf8b17ee2dddabd9e0a4da5c552" + # AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared-CallHistory.sqlite + CALL = "b463f7c4365eefc5a8723930d97928d4e907c603" + DOMAIN = "AppDomainGroup-group.net.whatsapp.WhatsAppSMB.shared" + class JidType(IntEnum): PM = 0 diff --git a/Whatsapp_Chat_Exporter/vcards_contacts.py b/Whatsapp_Chat_Exporter/vcards_contacts.py index ea38371..03d60ce 100644 --- a/Whatsapp_Chat_Exporter/vcards_contacts.py +++ b/Whatsapp_Chat_Exporter/vcards_contacts.py @@ -1,5 +1,12 @@ -import vobject +import logging +import re +import quopri from typing import List, TypedDict +from Whatsapp_Chat_Exporter.data_model import ChatStore +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device + + +logger = logging.getLogger(__name__) class ExportedContactNumbers(TypedDict): @@ -21,32 +28,155 @@ class ContactsFromVCards: for number, name in self.contact_mapping: # short number must be a bad contact, lets skip it if len(number) <= 5: + continue + chats_search = filter_chats_by_prefix(chats, number).values() + if chats_search: + for chat in chats_search: + if not hasattr(chat, 'name') or (hasattr(chat, 'name') and chat.name is None): + setattr(chat, 'name', name) + else: + chats.add_chat(number + "@s.whatsapp.net", ChatStore(Device.ANDROID, name)) + + +def decode_quoted_printable(value: str, charset: str) -> str: + """Decode a vCard value that may be quoted-printable UTF-8.""" + try: + bytes_val = quopri.decodestring(value) + return bytes_val.decode(charset, 
errors="replace") + except Exception: + # Fallback: return the original value if decoding fails + logger.warning( + f"Failed to decode quoted-printable value: {value}, " + f"charset: {charset}. Please report this issue.{CLEAR_LINE}" + ) + return value + +def _parse_vcard_line(line: str) -> tuple[str, dict[str, str], str] | None: + """ + Parses a single vCard property line into its components: + Property Name, Parameters (as a dict), and Value. + + Example: 'FN;CHARSET=UTF-8:John Doe' -> ('FN', {'CHARSET': 'UTF-8'}, 'John Doe') + """ + # Find the first colon, which separates the property/parameters from the value. + colon_index = line.find(':') + if colon_index == -1: + return None # Invalid vCard line format + + prop_and_params = line[:colon_index].strip() + value = line[colon_index + 1:].strip() + + # Split property name from parameters + parts = prop_and_params.split(';') + property_name = parts[0].upper() + + parameters = {} + for part in parts[1:]: + if '=' in part: + key, val = part.split('=', 1) + parameters[key.upper()] = val.strip('"') # Remove potential quotes from value + + return property_name, parameters, value + + +def get_vcard_value(entry: str, field_name: str) -> list[str]: + """ + Scans the vCard entry for lines starting with the specific field_name + and returns a list of its decoded values, handling parameters like + ENCODING and CHARSET. 
+ """ + target_name = field_name.upper() + cached_line = "" + charset = "utf-8" + values = [] + + for line in entry.splitlines(): + line = line.strip() + if cached_line: + if line.endswith('='): + cached_line += line[:-1] + continue # Wait for the next line to complete the value + values.append(decode_quoted_printable(cached_line + line, charset)) + cached_line = "" + else: + # Skip empty lines or lines that don't start with the target field (after stripping) + if not line or not line.upper().startswith(target_name): continue - for chat in filter_chats_by_prefix(chats, number).values(): - if not hasattr(chat, 'name') or (hasattr(chat, 'name') and chat.name is None): - setattr(chat, 'name', name) + parsed = _parse_vcard_line(line) + if parsed is None: + continue + + prop_name, params, raw_value = parsed + + if prop_name != target_name: + continue + + encoding = params.get('ENCODING') + charset = params.get('CHARSET', 'utf-8') + + # Apply decoding if ENCODING parameter is present + if encoding == 'QUOTED-PRINTABLE': + if raw_value.endswith('='): + # Handle soft line breaks in quoted-printable and cache the line + cached_line += raw_value[:-1] + continue # Wait for the next line to complete the value + values.append(decode_quoted_printable(raw_value, charset)) + elif encoding: + raise NotImplementedError(f"Encoding '{encoding}' not supported yet.") + else: + values.append(raw_value) + return values + + +def process_vcard_entry(entry: str) -> dict | bool: + """ + Process a vCard entry using pure string manipulation + + Args: + entry: A string containing a single vCard block. + + Returns: + A dictionary of the extracted data or False if required fields are missing. 
+ """ + + name = None + + # Extract name in priority: FN -> N -> ORG + for field in ("FN", "N", "ORG"): + if name_values := get_vcard_value(entry, field): + name = name_values[0].replace(';', ' ') # Simple cleanup for structured name + break + + if not name: + return False + + numbers = get_vcard_value(entry, "TEL") + if not numbers: + return False + + return { + "full_name": name, + # Remove duplications + "numbers": set(numbers), + } def read_vcards_file(vcf_file_path, default_country_code: str): contacts = [] - with open(vcf_file_path, mode="r", encoding="utf-8") as f: - reader = vobject.readComponents(f) - for row in reader: - if hasattr(row, 'fn'): - name = str(row.fn.value) - elif hasattr(row, 'n'): - name = str(row.n.value) - else: - name = None - if not hasattr(row, 'tel') or name is None: - continue - contact: ExportedContactNumbers = { - "full_name": name, - "numbers": list(map(lambda tel: tel.value, row.tel_list)), - } + with open(vcf_file_path, "r", encoding="utf-8", errors="ignore") as f: + content = f.read() + + # Split into individual vCards + vcards = content.split("BEGIN:VCARD") + for vcard in vcards: + if "END:VCARD" not in vcard: + continue + + if contact := process_vcard_entry(vcard): contacts.append(contact) + logger.info(f"Imported {len(contacts)} contacts/vcards{CLEAR_LINE}") return map_number_to_name(contacts, default_country_code) @@ -77,6 +207,6 @@ def normalize_number(number: str, country_code: str): return number[len(starting_char):] # leading zero should be removed - if starting_char == '0': + if number.startswith('0'): number = number[1:] return country_code + number # fall back diff --git a/Whatsapp_Chat_Exporter/vcards_contacts_test.py b/Whatsapp_Chat_Exporter/vcards_contacts_test.py deleted file mode 100644 index 194b637..0000000 --- a/Whatsapp_Chat_Exporter/vcards_contacts_test.py +++ /dev/null @@ -1,20 +0,0 @@ -# from contacts_names_from_vcards import readVCardsFile - -from Whatsapp_Chat_Exporter.vcards_contacts import 
normalize_number, read_vcards_file - - -def test_readVCardsFile(): - assert len(read_vcards_file("contacts.vcf", "973")) > 0 - -def test_create_number_to_name_dicts(): - pass - -def test_fuzzy_match_numbers(): - pass - -def test_normalize_number(): - assert normalize_number('0531234567', '1') == '1531234567' - assert normalize_number('001531234567', '2') == '1531234567' - assert normalize_number('+1531234567', '34') == '1531234567' - assert normalize_number('053(123)4567', '34') == '34531234567' - assert normalize_number('0531-234-567', '58') == '58531234567' diff --git a/Whatsapp_Chat_Exporter/whatsapp.html b/Whatsapp_Chat_Exporter/whatsapp.html index ae83b95..2aa2b7e 100644 --- a/Whatsapp_Chat_Exporter/whatsapp.html +++ b/Whatsapp_Chat_Exporter/whatsapp.html @@ -1,329 +1,467 @@ - - Whatsapp - {{ name }} - - - - - - -
- {{ headline }} - {% if status is not none %} -
- {{ status }} - {% endif %} -
-
-
- {% set last = {'last': 946688461.001} %} - {% for msg in msgs -%} -
- {% if determine_day(last.last, msg.timestamp) is not none %} -
{{ determine_day(last.last, msg.timestamp) }}
- {% if last.update({'last': msg.timestamp}) %}{% endif %} - {% endif %} - {% if msg.from_me == true %} -
-
{{ msg.time }}
-
You
-
-
- {% if not no_avatar and my_avatar is not none %} -
- {% else %} -
- {% endif %} -
- {% if msg.reply is not none %} - - {% endif %} - {% if msg.meta == true or msg.media == false and msg.data is none %} -
- {% if msg.safe %} -

{{ msg.data | safe or 'Not supported WhatsApp internal message' }}

- {% else %} -

{{ msg.data or 'Not supported WhatsApp internal message' }}

- {% endif %} -
- {% if msg.caption is not none %} -
- {{ msg.caption | urlize(none, true, '_blank') }} -
- {% endif %} - {% else %} - {% if msg.media == false %} - {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} - {% else %} - {% if "image/" in msg.mime %} - - - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} -
-

The file cannot be displayed here, however it should be located at here

-
- {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} - {% endif %} - {% if msg.caption is not none %} -
- {{ msg.caption | urlize(none, true, '_blank') }} -
- {% endif %} - {% endif %} - {% endif %} -
-
- {% if not no_avatar and my_avatar is not none %} -
- - - -
- {% endif %} -
- {% else %} -
-
- {% if msg.sender is not none %} - {{ msg.sender }} - {% else %} - {{ name }} - {% endif %} -
-
{{ msg.time }}
-
-
- {% if not no_avatar %} -
- {% if their_avatar is not none %} - - {% else %} - - {% endif %} -
-
- {% else %} -
- {% endif %} -
- {% if msg.reply is not none %} - - {% endif %} - {% if msg.meta == true or msg.media == false and msg.data is none %} -
- {% if msg.safe %} -

{{ msg.data | safe or 'Not supported WhatsApp internal message' }}

- {% else %} -

{{ msg.data or 'Not supported WhatsApp internal message' }}

- {% endif %} -
- {% if msg.caption is not none %} -
- {{ msg.caption | urlize(none, true, '_blank') }} -
- {% endif %} - {% else %} - {% if msg.media == false %} - {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} - {% else %} - {% if "image/" in msg.mime %} - - - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} -
-

The file cannot be displayed here, however it should be located at here

-
- {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} - {% endif %} - {% if msg.caption is not none %} -
- {{ msg.caption | urlize(none, true, '_blank') }} -
- {% endif %} - {% endif %} - {% endif %} -
-
-
- {% endif %} -
- {% endfor %} -
-
- - + + + + + + +
+
+
+
+ {% if not no_avatar %} +
+ {% if their_avatar is not none %} + + {% else %} + + {% endif %} +
+ {% endif %} +
+

{{ headline }}

+ {% if status is not none %}

{{ status }}

{% endif %} +
+
+
+ + + {% if previous %} + + + + + + {% endif %} + {% if next %} + + + + + + {% endif %} +
+ +
+ + +
+
+
+
+
+ + {% set last = {'last': 946688461.001} %} + {% for msg in msgs -%} + {% if determine_day(last.last, msg.timestamp) is not none %} +
+
+ {{ determine_day(last.last, msg.timestamp) }} +
+
+ {% if last.update({'last': msg.timestamp}) %}{% endif %} + {% endif %} + + {% if msg.from_me == true %} +
+
+
+
+ + + + +
+
+
+
+ {% if msg.reply is not none %} + +
+

Replying to

+

+ {% if msg.quoted_data is not none %} + "{{msg.quoted_data}}" + {% else %} + this message + {% endif %} +

+
+
+ {% endif %} +

+ {% if msg.meta == true or msg.media == false and msg.data is none %} +

+
+ {% if msg.safe %} + {{ msg.data | safe or 'Not supported WhatsApp internal message' }} + {% else %} + {{ msg.data or 'Not supported WhatsApp internal message' }} + {% endif %} +
+
+ {% if msg.caption is not none %} +

{{ msg.caption | urlize(none, true, '_blank') }}

+ {% endif %} + {% else %} + {% if msg.media == false %} + {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} + {% else %} + {% if "image/" in msg.mime %} + + + + {% elif "audio/" in msg.mime %} + + {% elif "video/" in msg.mime %} + + {% elif "/" in msg.mime %} + The file cannot be displayed here, however it should be located at here + {% else %} + {% filter escape %}{{ msg.data }}{% endfilter %} + {% endif %} + {% if msg.caption is not none %} + {{ msg.caption | urlize(none, true, '_blank') }} + {% endif %} + {% endif %} + {% endif %} +

+

{{ msg.time }}

+
+
+ {% else %} +
+
+ {% if msg.reply is not none %} + +
+

Replying to

+

+ {% if msg.quoted_data is not none %} + {{msg.quoted_data}} + {% else %} + this message + {% endif %} +

+
+
+ {% endif %} +

+ {% if msg.meta == true or msg.media == false and msg.data is none %} +

+
+ {% if msg.safe %} + {{ msg.data | safe or 'Not supported WhatsApp internal message' }} + {% else %} + {{ msg.data or 'Not supported WhatsApp internal message' }} + {% endif %} +
+
+ {% if msg.caption is not none %} +

{{ msg.caption | urlize(none, true, '_blank') }}

+ {% endif %} + {% else %} + {% if msg.media == false %} + {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} + {% else %} + {% if "image/" in msg.mime %} + + + + {% elif "audio/" in msg.mime %} + + {% elif "video/" in msg.mime %} + + {% elif "/" in msg.mime %} + The file cannot be displayed here, however it should be located at here + {% else %} + {% filter escape %}{{ msg.data }}{% endfilter %} + {% endif %} + {% if msg.caption is not none %} + {{ msg.caption | urlize(none, true, '_blank') }} + {% endif %} + {% endif %} + {% endif %} +

+
+ + {% if msg.sender is not none %} + {{ msg.sender }} + {% endif %} + + + {{ msg.time }} +
+
+ +
+ {% endif %} + {% endfor %} +
+
+

+ {% if not next %} + End of History + {% endif %} +

+
+ Portions of this page are reproduced from work created and shared by Google and used according to terms described in the Apache 2.0 License. +
+ + + + + + +
+
+ + - - + // Event listeners + searchButton.addEventListener('click', showSearch); + closeMainSearch.addEventListener('click', hideSearch); + + // Handle ESC key + document.addEventListener('keydown', (event) => { + if (event.key === 'Escape' && mainSearchInput.classList.contains('active')) { + hideSearch(); + } + }); + + + \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/whatsapp_new.html b/Whatsapp_Chat_Exporter/whatsapp_new.html deleted file mode 100644 index 2aa2b7e..0000000 --- a/Whatsapp_Chat_Exporter/whatsapp_new.html +++ /dev/null @@ -1,467 +0,0 @@ - - - - Whatsapp - {{ name }} - - - - - - - - -
-
-
-
- {% if not no_avatar %} -
- {% if their_avatar is not none %} - - {% else %} - - {% endif %} -
- {% endif %} -
-

{{ headline }}

- {% if status is not none %}

{{ status }}

{% endif %} -
-
-
- - - {% if previous %} - - - - - - {% endif %} - {% if next %} - - - - - - {% endif %} -
- -
- - -
-
-
-
-
- - {% set last = {'last': 946688461.001} %} - {% for msg in msgs -%} - {% if determine_day(last.last, msg.timestamp) is not none %} -
-
- {{ determine_day(last.last, msg.timestamp) }} -
-
- {% if last.update({'last': msg.timestamp}) %}{% endif %} - {% endif %} - - {% if msg.from_me == true %} -
-
-
-
- - - - -
-
-
-
- {% if msg.reply is not none %} - -
-

Replying to

-

- {% if msg.quoted_data is not none %} - "{{msg.quoted_data}}" - {% else %} - this message - {% endif %} -

-
-
- {% endif %} -

- {% if msg.meta == true or msg.media == false and msg.data is none %} -

-
- {% if msg.safe %} - {{ msg.data | safe or 'Not supported WhatsApp internal message' }} - {% else %} - {{ msg.data or 'Not supported WhatsApp internal message' }} - {% endif %} -
-
- {% if msg.caption is not none %} -

{{ msg.caption | urlize(none, true, '_blank') }}

- {% endif %} - {% else %} - {% if msg.media == false %} - {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} - {% else %} - {% if "image/" in msg.mime %} - - - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} - The file cannot be displayed here, however it should be located at here - {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} - {% endif %} - {% if msg.caption is not none %} - {{ msg.caption | urlize(none, true, '_blank') }} - {% endif %} - {% endif %} - {% endif %} -

-

{{ msg.time }}

-
-
- {% else %} -
-
- {% if msg.reply is not none %} - -
-

Replying to

-

- {% if msg.quoted_data is not none %} - {{msg.quoted_data}} - {% else %} - this message - {% endif %} -

-
-
- {% endif %} -

- {% if msg.meta == true or msg.media == false and msg.data is none %} -

-
- {% if msg.safe %} - {{ msg.data | safe or 'Not supported WhatsApp internal message' }} - {% else %} - {{ msg.data or 'Not supported WhatsApp internal message' }} - {% endif %} -
-
- {% if msg.caption is not none %} -

{{ msg.caption | urlize(none, true, '_blank') }}

- {% endif %} - {% else %} - {% if msg.media == false %} - {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} - {% else %} - {% if "image/" in msg.mime %} - - - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} - The file cannot be displayed here, however it should be located at here - {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} - {% endif %} - {% if msg.caption is not none %} - {{ msg.caption | urlize(none, true, '_blank') }} - {% endif %} - {% endif %} - {% endif %} -

-
- - {% if msg.sender is not none %} - {{ msg.sender }} - {% endif %} - - - {{ msg.time }} -
-
- -
- {% endif %} - {% endfor %} -
-
-

- {% if not next %} - End of History - {% endif %} -

-
- Portions of this page are reproduced from work created and shared by Google and used according to terms described in the Apache 2.0 License. -
- - - - - - -
-
- - - - - \ No newline at end of file diff --git a/Whatsapp_Chat_Exporter/whatsapp_old.html b/Whatsapp_Chat_Exporter/whatsapp_old.html new file mode 100644 index 0000000..ae83b95 --- /dev/null +++ b/Whatsapp_Chat_Exporter/whatsapp_old.html @@ -0,0 +1,329 @@ + + + + Whatsapp - {{ name }} + + + + + + +
+ {{ headline }} + {% if status is not none %} +
+ {{ status }} + {% endif %} +
+
+
+ {% set last = {'last': 946688461.001} %} + {% for msg in msgs -%} +
+ {% if determine_day(last.last, msg.timestamp) is not none %} +
{{ determine_day(last.last, msg.timestamp) }}
+ {% if last.update({'last': msg.timestamp}) %}{% endif %} + {% endif %} + {% if msg.from_me == true %} +
+
{{ msg.time }}
+
You
+
+
+ {% if not no_avatar and my_avatar is not none %} +
+ {% else %} +
+ {% endif %} +
+ {% if msg.reply is not none %} + + {% endif %} + {% if msg.meta == true or msg.media == false and msg.data is none %} +
+ {% if msg.safe %} +

{{ msg.data | safe or 'Not supported WhatsApp internal message' }}

+ {% else %} +

{{ msg.data or 'Not supported WhatsApp internal message' }}

+ {% endif %} +
+ {% if msg.caption is not none %} +
+ {{ msg.caption | urlize(none, true, '_blank') }} +
+ {% endif %} + {% else %} + {% if msg.media == false %} + {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} + {% else %} + {% if "image/" in msg.mime %} + + + + {% elif "audio/" in msg.mime %} + + {% elif "video/" in msg.mime %} + + {% elif "/" in msg.mime %} +
+

The file cannot be displayed here, however it should be located at here

+
+ {% else %} + {% filter escape %}{{ msg.data }}{% endfilter %} + {% endif %} + {% if msg.caption is not none %} +
+ {{ msg.caption | urlize(none, true, '_blank') }} +
+ {% endif %} + {% endif %} + {% endif %} +
+
+ {% if not no_avatar and my_avatar is not none %} +
+ + + +
+ {% endif %} +
+ {% else %} +
+
+ {% if msg.sender is not none %} + {{ msg.sender }} + {% else %} + {{ name }} + {% endif %} +
+
{{ msg.time }}
+
+
+ {% if not no_avatar %} +
+ {% if their_avatar is not none %} + + {% else %} + + {% endif %} +
+
+ {% else %} +
+ {% endif %} +
+ {% if msg.reply is not none %} + + {% endif %} + {% if msg.meta == true or msg.media == false and msg.data is none %} +
+ {% if msg.safe %} +

{{ msg.data | safe or 'Not supported WhatsApp internal message' }}

+ {% else %} +

{{ msg.data or 'Not supported WhatsApp internal message' }}

+ {% endif %} +
+ {% if msg.caption is not none %} +
+ {{ msg.caption | urlize(none, true, '_blank') }} +
+ {% endif %} + {% else %} + {% if msg.media == false %} + {{ msg.data | sanitize_except() | urlize(none, true, '_blank') }} + {% else %} + {% if "image/" in msg.mime %} + + + + {% elif "audio/" in msg.mime %} + + {% elif "video/" in msg.mime %} + + {% elif "/" in msg.mime %} +
+

The file cannot be displayed here, however it should be located at here

+
+ {% else %} + {% filter escape %}{{ msg.data }}{% endfilter %} + {% endif %} + {% if msg.caption is not none %} +
+ {{ msg.caption | urlize(none, true, '_blank') }} +
+ {% endif %} + {% endif %} + {% endif %} +
+
+
+ {% endif %} +
+ {% endfor %} +
+
+ + + + + \ No newline at end of file diff --git a/imgs/group.png b/imgs/group.png deleted file mode 100644 index bf88028..0000000 Binary files a/imgs/group.png and /dev/null differ diff --git a/imgs/pm.png b/imgs/pm.png index 4906ba1..fa0359f 100644 Binary files a/imgs/pm.png and b/imgs/pm.png differ diff --git a/pyproject.toml b/pyproject.toml index f5931d4..ee8ec7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "whatsapp-chat-exporter" -version = "0.12.1" +version = "0.13.0rc1" description = "A Whatsapp database parser that provides history of your Whatsapp conversations in HTML and JSON. Android, iOS, iPadOS, Crypt12, Crypt14, Crypt15 supported." readme = "README.md" authors = [ @@ -19,11 +19,11 @@ keywords = [ ] classifiers = [ "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Development Status :: 4 - Beta", @@ -33,7 +33,7 @@ classifiers = [ "Topic :: Utilities", "Topic :: Database" ] -requires-python = ">=3.9" +requires-python = ">=3.10" dependencies = [ "jinja2", "bleach" @@ -44,10 +44,9 @@ android_backup = ["pycryptodome", "javaobj-py3"] crypt12 = ["pycryptodome"] crypt14 = ["pycryptodome"] crypt15 = ["pycryptodome", "javaobj-py3"] -all = ["pycryptodome", "javaobj-py3", "vobject"] -everything = ["pycryptodome", "javaobj-py3", "vobject"] +all = ["pycryptodome", "javaobj-py3"] +everything = ["pycryptodome", "javaobj-py3"] backup = ["pycryptodome", "javaobj-py3"] -vcards = ["vobject", "pycryptodome", "javaobj-py3"] [project.scripts] wtsexporter = "Whatsapp_Chat_Exporter.__main__:main" @@ -60,3 +59,8 @@ include = ["Whatsapp_Chat_Exporter"] 
[tool.setuptools.package-data] Whatsapp_Chat_Exporter = ["*.html"] + +[dependency-groups] +dev = [ + "pytest>=8.3.5", +] diff --git a/scripts/brazilian_number_processing.py b/scripts/brazilian_number_processing.py index bb51213..a42678c 100644 --- a/scripts/brazilian_number_processing.py +++ b/scripts/brazilian_number_processing.py @@ -6,19 +6,20 @@ Contributed by @magpires https://github.com/KnugiHK/WhatsApp-Chat-Exporter/issue import re import argparse + def process_phone_number(raw_phone): """ Process the raw phone string from the VCARD and return two formatted numbers: - The original formatted number, and - A modified formatted number with the extra (ninth) digit removed, if applicable. - + Desired output: For a number with a 9-digit subscriber: Original: "+55 {area} {first 5 of subscriber}-{last 4 of subscriber}" Modified: "+55 {area} {subscriber[1:5]}-{subscriber[5:]}" For example, for an input that should represent "027912345678", the outputs are: "+55 27 91234-5678" and "+55 27 1234-5678" - + This function handles numbers that may already include a "+55" prefix. It expects that after cleaning, a valid number (without the country code) should have either 10 digits (2 for area + 8 for subscriber) or 11 digits (2 for area + 9 for subscriber). 
@@ -26,18 +27,18 @@ def process_phone_number(raw_phone): """ # Store the original input for processing number_to_process = raw_phone.strip() - + # Remove all non-digit characters digits = re.sub(r'\D', '', number_to_process) - + # If the number starts with '55', remove it for processing if digits.startswith("55") and len(digits) > 11: digits = digits[2:] - + # Remove trunk zero if present if digits.startswith("0"): digits = digits[1:] - + # After cleaning, we expect a valid number to have either 10 or 11 digits # If there are extra digits, use the last 11 (for a 9-digit subscriber) or last 10 (for an 8-digit subscriber) if len(digits) > 11: @@ -46,7 +47,7 @@ def process_phone_number(raw_phone): elif len(digits) > 10 and len(digits) < 11: # In some cases with an 8-digit subscriber, take the last 10 digits digits = digits[-10:] - + # Check if we have a valid number after processing if len(digits) not in (10, 11): return None, None @@ -70,6 +71,7 @@ def process_phone_number(raw_phone): return original_formatted, modified_formatted + def process_vcard(input_vcard, output_vcard): """ Process a VCARD file to standardize telephone entries and add a second TEL line @@ -77,13 +79,13 @@ def process_vcard(input_vcard, output_vcard): """ with open(input_vcard, 'r', encoding='utf-8') as file: lines = file.readlines() - + output_lines = [] - + # Regex to capture any telephone line. # It matches lines starting with "TEL:" or "TEL;TYPE=..." or with prefixes like "item1.TEL:". 
phone_pattern = re.compile(r'^(?P.*TEL(?:;TYPE=[^:]+)?):(?P.*)$') - + for line in lines: stripped_line = line.rstrip("\n") match = phone_pattern.match(stripped_line) @@ -99,10 +101,11 @@ def process_vcard(input_vcard, output_vcard): output_lines.append(f"TEL;TYPE=CELL:{mod_formatted}\n") else: output_lines.append(line) - + with open(output_vcard, 'w', encoding='utf-8') as file: file.writelines(output_lines) + if __name__ == '__main__': parser = argparse.ArgumentParser( description="Process a VCARD file to standardize telephone entries and add a second TEL line with the modified number (removing the extra ninth digit) for contacts with 9-digit subscribers." @@ -110,6 +113,6 @@ if __name__ == '__main__': parser.add_argument('input_vcard', type=str, help='Input VCARD file') parser.add_argument('output_vcard', type=str, help='Output VCARD file') args = parser.parse_args() - + process_vcard(args.input_vcard, args.output_vcard) - print(f"VCARD processed and saved to {args.output_vcard}") \ No newline at end of file + print(f"VCARD processed and saved to {args.output_vcard}") diff --git a/scripts/bruteforce_crypt15.py b/scripts/bruteforce_crypt15.py index 852176c..d4497ce 100644 --- a/scripts/bruteforce_crypt15.py +++ b/scripts/bruteforce_crypt15.py @@ -27,23 +27,24 @@ def _extract_encrypted_key(keyfile): return _generate_hmac_of_hmac(key_stream) -key = open("encrypted_backup.key", "rb").read() -database = open("wa.db.crypt15", "rb").read() -main_key, hex_key = _extract_encrypted_key(key) -for i in range(100): - iv = database[i:i+16] - for j in range(100): - cipher = AES.new(main_key, AES.MODE_GCM, iv) - db_ciphertext = database[j:] - db_compressed = cipher.decrypt(db_ciphertext) - try: - db = zlib.decompress(db_compressed) - except zlib.error: - ... 
- else: - if db[0:6] == b"SQLite": - print(f"Found!\nIV: {i}\nOffset: {j}") - print(db_compressed[:10]) - exit() +if __name__ == "__main__": + key = open("encrypted_backup.key", "rb").read() + database = open("wa.db.crypt15", "rb").read() + main_key, hex_key = _extract_encrypted_key(key) + for i in range(100): + iv = database[i:i+16] + for j in range(100): + cipher = AES.new(main_key, AES.MODE_GCM, iv) + db_ciphertext = database[j:] + db_compressed = cipher.decrypt(db_ciphertext) + try: + db = zlib.decompress(db_compressed) + except zlib.error: + ... + else: + if db[0:6] == b"SQLite": + print(f"Found!\nIV: {i}\nOffset: {j}") + print(db_compressed[:10]) + exit() -print("Not found! Try to increase maximum search.") + print("Not found! Try to increase maximum search.") diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/contacts.vcf b/tests/data/contacts.vcf new file mode 100644 index 0000000..1e872be --- /dev/null +++ b/tests/data/contacts.vcf @@ -0,0 +1,44 @@ +BEGIN:VCARD +VERSION:3.0 +FN:Sample Contact +TEL;TYPE=CELL:+85288888888 +END:VCARD + +BEGIN:VCARD +VERSION:2.1 +N:Lopez;Yard Lawn Guy;Jose;; +FN:Yard Lawn Guy, Jose Lopez +TEL;HOME:5673334444 +END:VCARD + +BEGIN:VCARD +VERSION:2.1 +N;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:;=4A=6F=68=6E=20=42=75=74=6C=65=72=20=F0=9F=8C=9F= +=F0=9F=92=AB=F0=9F=8C=9F;;; +FN;CHARSET=UTF-8;ENCODING=QUOTED-PRINTABLE:=4A=6F=68=6E=20=42=75=74=6C=65=72=20=F0=9F=8C=9F= +=F0=9F=92=AB=F0=9F=8C=9F +TEL;PREF:5556667777 +END:VCARD + +BEGIN:VCARD +VERSION:2.1 +TEL;WORK;PREF:1234567890 +ORG:Airline Contact #'s +NOTE;ENCODING=QUOTED-PRINTABLE:=53=70=69=72=69=74=20=41=69=72=6C=69= +=6E=65=73=20=38=30=30=2D=37=37=32=2D=37=31=31=37=55=6E=69=74=65=64= +=20=41=69=72=6C=69=6E=65=73=20=38=30=30=2D=32=34=31=2D=36=35=32=32 +END:VCARD + +BEGIN:VCARD +VERSION:2.1 +TEL;WORK;PREF:3451112222 +X-SAMSUNGADR;ENCODING=QUOTED-PRINTABLE:;;=31=31=31=31=32=20=4E=6F=72=74=68=20=45=6C=64=72= 
+=69=64=67=65=20=50=61=72=6B=77=61=79;=44=61=6C=6C=61=73;=54=58;=32=32=32=32=32 +ORG:James Peacock Elementary +END:VCARD + +BEGIN:VCARD +VERSION:2.1 +TEL;CELL:8889990001 +ORG:AAA Car Service +END:VCARD diff --git a/scripts/brazilian_number_processing_test.py b/tests/test_brazilian_number_processing.py similarity index 95% rename from scripts/brazilian_number_processing_test.py rename to tests/test_brazilian_number_processing.py index 8c6a38b..3612481 100644 --- a/scripts/brazilian_number_processing_test.py +++ b/tests/test_brazilian_number_processing.py @@ -4,13 +4,14 @@ import tempfile import os from unittest.mock import patch -from brazilian_number_processing import process_phone_number, process_vcard +from scripts.brazilian_number_processing import process_phone_number, process_vcard + class TestVCardProcessor(unittest.TestCase): - + def test_process_phone_number(self): """Test the process_phone_number function with various inputs.""" - + # Test cases for 9-digit subscriber numbers test_cases_9_digit = [ # Standard 11-digit number (2 area + 9 subscriber) @@ -30,7 +31,7 @@ class TestVCardProcessor(unittest.TestCase): # With extra non-digit characters ("+55-27-9.1234_5678", "+55 27 91234-5678", "+55 27 1234-5678"), ] - + # Test cases for 8-digit subscriber numbers test_cases_8_digit = [ # Standard 10-digit number (2 area + 8 subscriber) @@ -46,7 +47,7 @@ class TestVCardProcessor(unittest.TestCase): # With country code and trunk zero ("+55 0 27 1234-5678", "+55 27 1234-5678", None), ] - + # Edge cases edge_cases = [ # Too few digits @@ -60,19 +61,19 @@ class TestVCardProcessor(unittest.TestCase): # Unusual formatting but valid number ("(+55) [27] 9.1234_5678", "+55 27 91234-5678", "+55 27 1234-5678"), ] - + # Run tests for all cases all_cases = test_cases_9_digit + test_cases_8_digit + edge_cases - + for raw_phone, expected_orig, expected_mod in all_cases: with self.subTest(raw_phone=raw_phone): orig, mod = process_phone_number(raw_phone) self.assertEqual(orig, 
expected_orig) self.assertEqual(mod, expected_mod) - + def test_process_vcard(self): """Test the process_vcard function with various VCARD formats.""" - + # Test case 1: Standard TEL entries vcard1 = """BEGIN:VCARD VERSION:3.0 @@ -202,26 +203,26 @@ END:VCARD (vcard5, expected5), (vcard6, expected6) ] - + for i, (input_vcard, expected_output) in enumerate(test_cases): with self.subTest(case=i+1): # Create temporary files for input and output with tempfile.NamedTemporaryFile(mode='w+', delete=False, encoding='utf-8') as input_file: input_file.write(input_vcard) input_path = input_file.name - + output_path = input_path + '.out' - + try: # Process the VCARD process_vcard(input_path, output_path) - + # Read and verify the output with open(output_path, 'r', encoding='utf-8') as output_file: actual_output = output_file.read() self.assertEqual(actual_output, expected_output) - + finally: # Clean up temporary files if os.path.exists(input_path): @@ -231,7 +232,7 @@ END:VCARD def test_script_argument_handling(self): """Test the script's command-line argument handling.""" - + test_input = """BEGIN:VCARD VERSION:3.0 N:Test;User;;; @@ -239,16 +240,17 @@ FN:User Test TEL:+5527912345678 END:VCARD """ - + # Create a temporary input file with tempfile.NamedTemporaryFile(mode='w+', delete=False, encoding='utf-8') as input_file: input_file.write(test_input) input_path = input_file.name - + output_path = input_path + '.out' - + try: - test_args = ['python' if os.name == 'nt' else 'python3', 'brazilian_number_processing.py', input_path, output_path] + test_args = ['python' if os.name == 'nt' else 'python3', + 'scripts/brazilian_number_processing.py', input_path, output_path] # We're just testing that the argument parsing works subprocess.call( test_args, @@ -257,7 +259,7 @@ END:VCARD ) # Check if the output file was created self.assertTrue(os.path.exists(output_path)) - + finally: # Clean up temporary files if os.path.exists(input_path): @@ -265,5 +267,6 @@ END:VCARD if 
os.path.exists(output_path): os.unlink(output_path) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_exporter.py b/tests/test_exporter.py new file mode 100644 index 0000000..2a38b26 --- /dev/null +++ b/tests/test_exporter.py @@ -0,0 +1,50 @@ +import subprocess +import pytest + + +@pytest.fixture +def command_runner(): + """ + A pytest fixture to simplify running commands. This is a helper + function that you can use in multiple tests. + """ + def _run_command(command_list, check=True): + """ + Runs a command and returns the result. + + Args: + command_list (list): A list of strings representing the command + and its arguments (e.g., ["python", "my_script.py", "arg1"]). + check (bool, optional): If True, raise an exception if the + command returns a non-zero exit code. Defaults to True. + + Returns: + subprocess.CompletedProcess: The result of the command. + """ + return subprocess.run( + command_list, + capture_output=True, + text=True, + check=check, + ) + return _run_command + + +def test_sanity_check(command_runner): + """ + This is a basic sanity check to make sure all modules can be imported + This runs the exporter without any arguments. It should fail with a + message about missing arguments. + """ + result = command_runner(["wtsexporter"], False) + expected_stderr = "You must define the device type" + assert expected_stderr in result.stderr, f"STDERR was: {result.stderr}" + assert result.returncode == 2 + + +def test_android(command_runner): + ... + + +def test_ios(command_runner): + ... 
diff --git a/tests/test_incremental_merge.py b/tests/test_incremental_merge.py new file mode 100644 index 0000000..527e5ae --- /dev/null +++ b/tests/test_incremental_merge.py @@ -0,0 +1,341 @@ +import os +import json +import pytest +from unittest.mock import patch, mock_open, call, MagicMock +from Whatsapp_Chat_Exporter.utility import incremental_merge +from Whatsapp_Chat_Exporter.data_model import ChatStore + +# Test data setup +BASE_PATH = "AppDomainGroup-group.net.whatsapp.WhatsApp.shared" +chat_data_1 = { + "12345678@s.whatsapp.net": { + "name": "Friend", + "type": "ios", + "my_avatar": os.path.join(BASE_PATH, "Media", "Profile", "Photo.jpg"), + "their_avatar": os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), + "their_avatar_thumb": None, + "status": None, + "messages": { + "24690": { + "from_me": True, + "timestamp": 1463926635.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B7E", + "meta": False, + "data": "I'm here", + "safe": False, + "sticker": False + }, + "24691": { # This message only exists in target + "from_me": False, + "timestamp": 1463926641.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B8E", + "meta": False, + "data": "Great to see you", + "safe": False, + "sticker": False + } + } + } +} + +chat_data_2 = { + "12345678@s.whatsapp.net": { + "name": "Friend", + "type": "ios", + "my_avatar": os.path.join(BASE_PATH, "Media", "Profile", "Photo.jpg"), + "their_avatar": os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), + "their_avatar_thumb": None, + "status": None, + "messages": { + "24690": { + "from_me": True, + "timestamp": 1463926635.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B7E", + "meta": False, + "data": "I'm here", + "safe": False, + "sticker": False + }, + "24692": { # This message only exists in source + "from_me": False, + "timestamp": 1463926642.571629, + "time": "10:17", + "media": False, + "key_id": 
"34B5EF10FBCA37B9E", + "meta": False, + "data": "Hi there!", + "safe": False, + "sticker": False + }, + } + } +} + +# Expected merged data - should contain all messages with all fields initialized as they would be by Message class +chat_data_merged = { + "12345678@s.whatsapp.net": { + "name": "Friend", + "type": "ios", + "my_avatar": os.path.join(BASE_PATH, "Media", "Profile", "Photo.jpg"), + "their_avatar": os.path.join(BASE_PATH, "Media", "Profile", "12345678-1709851420.thumb"), + "their_avatar_thumb": None, + "status": None, + "media_base": "", + "messages": { + "24690": { + "from_me": True, + "timestamp": 1463926635.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B7E", + "meta": False, + "data": "I'm here", + "sender": None, + "safe": False, + "mime": None, + "reply": None, + "quoted_data": None, + "caption": None, + "thumb": None, + "sticker": False, + "message_type": None, + "received_timestamp": None, + "read_timestamp": None + }, + "24691": { + "from_me": False, + "timestamp": 1463926641.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B8E", + "meta": False, + "data": "Great to see you", + "sender": None, + "safe": False, + "mime": None, + "reply": None, + "quoted_data": None, + "caption": None, + "thumb": None, + "sticker": False, + "message_type": None, + "received_timestamp": None, + "read_timestamp": None + }, + "24692": { + "from_me": False, + "timestamp": 1463926642.571629, + "time": "10:17", + "media": False, + "key_id": "34B5EF10FBCA37B9E", + "meta": False, + "data": "Hi there!", + "sender": None, + "safe": False, + "mime": None, + "reply": None, + "quoted_data": None, + "caption": None, + "thumb": None, + "sticker": False, + "message_type": None, + "received_timestamp": None, + "read_timestamp": None + }, + } + } +} + + +@pytest.fixture +def mock_filesystem(): + with ( + patch("os.path.exists") as mock_exists, + patch("os.makedirs") as mock_makedirs, + patch("os.path.getmtime") as mock_getmtime, + 
patch("os.listdir") as mock_listdir, + patch("os.walk") as mock_walk, + patch("shutil.copy2") as mock_copy2, + ): + yield { + "exists": mock_exists, + "makedirs": mock_makedirs, + "getmtime": mock_getmtime, + "listdir": mock_listdir, + "walk": mock_walk, + "copy2": mock_copy2, + } + + +def test_incremental_merge_new_file(mock_filesystem): + """Test merging when target file doesn't exist""" + source_dir = "/source" + target_dir = "/target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: x == "/source" + mock_filesystem["listdir"].return_value = ["chat.json"] + + # Run the function + incremental_merge(source_dir, target_dir, media_dir, 2, True) + + # Verify the operations + mock_filesystem["makedirs"].assert_called_once_with(target_dir, exist_ok=True) + mock_filesystem["copy2"].assert_called_once_with( + os.path.join(source_dir, "chat.json"), + os.path.join(target_dir, "chat.json") + ) + + +def test_incremental_merge_existing_file_with_changes(mock_filesystem): + """Test merging when target file exists and has changes""" + source_dir = "source" + target_dir = "target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: True + mock_filesystem["listdir"].return_value = ["chat.json"] + + # Mock file operations with consistent path separators + source_file = os.path.join(source_dir, "chat.json") + target_file = os.path.join(target_dir, "chat.json") + mock_file_content = { + source_file: json.dumps(chat_data_2), + target_file: json.dumps(chat_data_1), + } + + written_chunks = [] + + def mock_file_write(data): + written_chunks.append(data) + + mock_write = MagicMock(side_effect=mock_file_write) + + with patch("builtins.open", mock_open()) as mock_file: + def mock_file_read(filename, mode="r"): + if mode == 'w': + file_mock = mock_open().return_value + file_mock.write.side_effect = mock_write + return file_mock + else: + # Use normalized path for lookup + norm_filename = 
os.path.normpath(filename) + content = mock_file_content.get(norm_filename, '') + file_mock = mock_open(read_data=content).return_value + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir, 2, True) + + # Verify file operations using os.path.join + mock_file.assert_any_call(source_file, "r") + mock_file.assert_any_call(target_file, "r") + mock_file.assert_any_call(target_file, "w") + + # Rest of verification code... + assert mock_write.called, "Write method was never called" + written_data = json.loads(''.join(written_chunks)) + assert written_data is not None, "No data was written" + assert written_data == chat_data_merged, "Merged data does not match expected result" + + messages = written_data["12345678@s.whatsapp.net"]["messages"] + assert "24690" in messages, "Common message should be present" + assert "24691" in messages, "Target-only message should be preserved" + assert "24692" in messages, "Source-only message should be added" + assert len(messages) == 3, "Should have exactly 3 messages" + + +def test_incremental_merge_existing_file_no_changes(mock_filesystem): + """Test merging when target file exists but has no changes""" + source_dir = "source" + target_dir = "target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: True + mock_filesystem["listdir"].return_value = ["chat.json"] + + # Mock file operations with consistent path separators + source_file = os.path.join(source_dir, "chat.json") + target_file = os.path.join(target_dir, "chat.json") + mock_file_content = { + source_file: json.dumps(chat_data_1), + target_file: json.dumps(chat_data_1), + } + + with patch("builtins.open", mock_open()) as mock_file: + def mock_file_read(filename, mode="r"): + if mode == 'w': + file_mock = mock_open().return_value + return file_mock + else: + # Use normalized path for lookup + norm_filename = os.path.normpath(filename) + content 
= mock_file_content.get(norm_filename, '') + file_mock = mock_open(read_data=content).return_value + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir, 2, True) + + # Verify no write operations occurred on target file + write_calls = [ + call for call in mock_file.mock_calls if call[0] == "().write"] + assert len(write_calls) == 0 + + +def test_incremental_merge_media_copy(mock_filesystem): + """Test media file copying during merge""" + source_dir = "source" + target_dir = "target" + media_dir = "media" + + # Setup mock filesystem + mock_filesystem["exists"].side_effect = lambda x: True + mock_filesystem["listdir"].return_value = ["chat.json"] + mock_filesystem["walk"].return_value = [ + (os.path.join(source_dir, "media"), ["subfolder"], ["file1.jpg"]), + (os.path.join(source_dir, "media", "subfolder"), [], ["file2.jpg"]), + ] + mock_filesystem["getmtime"].side_effect = lambda x: 1000 if "source" in x else 500 + + # Mock file operations with consistent path separators + source_file = os.path.join(source_dir, "chat.json") + target_file = os.path.join(target_dir, "chat.json") + mock_file_content = { + source_file: json.dumps(chat_data_1), + target_file: json.dumps(chat_data_1), + } + + with patch("builtins.open", mock_open()) as mock_file: + def mock_file_read(filename, mode="r"): + if mode == 'w': + file_mock = mock_open().return_value + return file_mock + else: + # Use normalized path for lookup + norm_filename = os.path.normpath(filename) + content = mock_file_content.get(norm_filename, '') + file_mock = mock_open(read_data=content).return_value + return file_mock + + mock_file.side_effect = mock_file_read + + # Run the function + incremental_merge(source_dir, target_dir, media_dir, 2, True) + + # Verify media file operations + assert mock_filesystem["makedirs"].call_count >= 2 # At least target dir and media dir + assert mock_filesystem["copy2"].call_count == 2 # Two media 
files copied diff --git a/tests/test_nuitka_binary.py b/tests/test_nuitka_binary.py new file mode 100644 index 0000000..0334c21 --- /dev/null +++ b/tests/test_nuitka_binary.py @@ -0,0 +1,76 @@ +import os +import sys +import pytest +import subprocess + + +@pytest.fixture +def command_runner(): + """ + A pytest fixture to simplify running commands. This is a helper + function that you can use in multiple tests. + """ + def _run_command(command_list, check=True): + """ + Runs a command and returns the result. + + Args: + command_list (list): A list of strings representing the command + and its arguments (e.g., ["python", "my_script.py", "arg1"]). + check (bool, optional): If True, raise an exception if the + command returns a non-zero exit code. Defaults to True. + + Returns: + subprocess.CompletedProcess: The result of the command. + """ + return subprocess.run( + command_list, + capture_output=True, + text=True, + check=check, + ) + return _run_command + + +def test_nuitka_binary(): + """ + Tests the creation and execution of a Nuitka-compiled binary. + """ + + if sys.version_info >= (3, 14): + print("Skipping Nuitka test: Python 3.14 is not yet fully supported by Nuitka.") + return + + nuitka_command = [ + "python", "-m", "nuitka", "--onefile", "--assume-yes-for-downloads", + "--include-data-file=./Whatsapp_Chat_Exporter/whatsapp.html=./Whatsapp_Chat_Exporter/whatsapp.html", + "Whatsapp_Chat_Exporter", + "--output-filename=wtsexporter.exe" # use .exe on all platforms for compatibility + ] + + compile_result = subprocess.run( + nuitka_command, + capture_output=True, + text=True, + check=True + ) + print(f"Nuitka compilation output: {compile_result.stdout}") + + binary_path = "./wtsexporter.exe" + assert os.path.exists(binary_path), f"Binary {binary_path} was not created." 
+ + try: + execute_result = subprocess.run( + [binary_path, "--help"], + capture_output=True, + text=True, + check=True, + ) + print(f"Binary execution output: {execute_result.stdout}") + assert "usage:" in execute_result.stdout.lower(), "Binary did not produce expected help output." + except subprocess.CalledProcessError as e: + print(f"Binary execution failed with error: {e.stderr}") + raise + finally: + if os.path.exists(binary_path): + os.remove(binary_path) diff --git a/tests/test_utility.py b/tests/test_utility.py new file mode 100644 index 0000000..8ea2af4 --- /dev/null +++ b/tests/test_utility.py @@ -0,0 +1,256 @@ +import pytest +import random +import string +from unittest.mock import patch, mock_open, MagicMock +from Whatsapp_Chat_Exporter.utility import * + + +def test_convert_time_unit(): + assert convert_time_unit(0) == "less than a second" + assert convert_time_unit(1) == "a second" + assert convert_time_unit(10) == "10 seconds" + assert convert_time_unit(60) == "1 minute" + assert convert_time_unit(61) == "1 minute 1 second" + assert convert_time_unit(122) == "2 minutes 2 seconds" + assert convert_time_unit(3600) == "1 hour" + assert convert_time_unit(3661) == "1 hour 1 minute 1 second" + assert convert_time_unit(3720) == "1 hour 2 minutes" + assert convert_time_unit(3660) == "1 hour 1 minute" + assert convert_time_unit(7263) == "2 hours 1 minute 3 seconds" + assert convert_time_unit(86400) == "1 day" + assert convert_time_unit(86461) == "1 day 1 minute 1 second" + assert convert_time_unit(172805) == "2 days 5 seconds" + + +class TestBytesToReadable: + assert bytes_to_readable(0) == "0 B" + assert bytes_to_readable(500) == "500 B" + assert bytes_to_readable(1024) == "1.0 KB" + assert bytes_to_readable(2048) == "2.0 KB" + assert bytes_to_readable(1536) == "1.5 KB" + assert bytes_to_readable(1024**2) == "1.0 MB" + assert bytes_to_readable(5 * 1024**2) == "5.0 MB" + assert bytes_to_readable(1024**3) == "1.0 GB" + assert bytes_to_readable(1024**4) == "1.0 
TB" + assert bytes_to_readable(1024**5) == "1.0 PB" + assert bytes_to_readable(1024**6) == "1.0 EB" + assert bytes_to_readable(1024**7) == "1.0 ZB" + assert bytes_to_readable(1024**8) == "1.0 YB" + + +class TestReadableToBytes: + def test_conversion(self): + assert readable_to_bytes("0B") == 0 + assert readable_to_bytes("100B") == 100 + assert readable_to_bytes("50 B") == 50 + assert readable_to_bytes("1KB") == 1024 + assert readable_to_bytes("2.5 KB") == 2560 + assert readable_to_bytes("2.0 KB") == 2048 + assert readable_to_bytes("1MB") == 1024**2 + assert readable_to_bytes("0.5 MB") == 524288 + assert readable_to_bytes("1. MB") == 1048576 + assert readable_to_bytes("1GB") == 1024**3 + assert readable_to_bytes("1.GB") == 1024**3 + assert readable_to_bytes("1TB") == 1024**4 + assert readable_to_bytes("1PB") == 1024**5 + assert readable_to_bytes("1EB") == 1024**6 + assert readable_to_bytes("1ZB") == 1024**7 + assert readable_to_bytes("1YB") == 1024**8 + + def test_case_insensitivity(self): + assert readable_to_bytes("1kb") == 1024 + assert readable_to_bytes("2mB") == 2 * 1024**2 + + def test_whitespace(self): + assert readable_to_bytes(" 10 KB ") == 10 * 1024 + assert readable_to_bytes(" 1 MB") == 1024**2 + + def test_invalid_unit(self): + with pytest.raises(ValueError, match="Invalid size format for size_str"): + readable_to_bytes("100X") + readable_to_bytes("A100") + readable_to_bytes("100$$$$$") + + def test_invalid_number(self): + with pytest.raises(ValueError, match="Invalid size format for size_str"): + readable_to_bytes("ABC KB") + + def test_missing_unit(self): + assert readable_to_bytes("100") == 100 + + +class TestSanitizeExcept: + def test_no_tags(self): + html = "This is plain text." + assert sanitize_except(html) == Markup("This is plain text.") + + def test_allowed_br_tag(self): + html = "Line 1<br>Line 2" + assert sanitize_except(html) == Markup("Line 1<br>Line 2") + html = "<br>Line" + assert sanitize_except(html) == Markup("<br>Line") + html = "Line<br>" + assert sanitize_except(html) == Markup("Line<br>")
+ def test_mixed_tags(self): + html = "<b>Bold</b><br><i>Italic</i><img src='evil.gif'><script>alert('XSS')</script>" + assert sanitize_except(html) == Markup( + "&lt;b&gt;Bold&lt;/b&gt;<br>&lt;i&gt;Italic&lt;/i&gt;&lt;img src='evil.gif'&gt;&lt;script&gt;alert('XSS')&lt;/script&gt;") + + def test_attribute_stripping(self): + html = "<br class='someclass'>" + assert sanitize_except(html) == Markup("<br>")
+ + +class TestDetermineDay: + def test_same_day(self): + timestamp1 = 1678838400 # March 15, 2023 00:00:00 GMT + timestamp2 = 1678881600 # March 15, 2023 12:00:00 GMT + assert determine_day(timestamp1, timestamp2) is None + + def test_different_day(self): + timestamp1 = 1678886400 # March 15, 2023 13:20:00 GMT + timestamp2 = 1678972800 # March 16, 2023 13:20:00 GMT + assert determine_day(timestamp1, timestamp2) == datetime(2023, 3, 16).date() + + def test_crossing_month(self): + timestamp1 = 1680220800 # March 31, 2023 00:00:00 GMT + timestamp2 = 1680307200 # April 1, 2023 00:00:00 GMT + assert determine_day(timestamp1, timestamp2) == datetime(2023, 4, 1).date() + + def test_crossing_year(self): + timestamp1 = 1703980800 # December 31, 2023 00:00:00 GMT + timestamp2 = 1704067200 # January 1, 2024 00:00:00 GMT + assert determine_day(timestamp1, timestamp2) == datetime(2024, 1, 1).date() + + +class TestGetFileName: + def test_valid_contact_phone_number_no_chat_name(self): + chat = ChatStore(Device.ANDROID, name=None) + filename, name = get_file_name("1234567890@s.whatsapp.net", chat) + assert filename == "1234567890" + assert name == "1234567890" + + def test_valid_contact_phone_number_with_chat_name(self): + chat = ChatStore(Device.IOS, name="My Chat Group") + filename, name = get_file_name("1234567890@s.whatsapp.net", chat) + assert filename == "1234567890-My-Chat-Group" + assert name == "My Chat Group" + + def test_valid_contact_exported_chat(self): + chat = ChatStore(Device.ANDROID, name="Testing") + filename, name = get_file_name("ExportedChat", chat) + assert filename == "ExportedChat-Testing" + assert name == "Testing" + + def test_valid_contact_special_ids(self): + chat = ChatStore(Device.ANDROID, name="Special Chat") + filename_000, name_000 = get_file_name("000000000000000", chat) + assert filename_000 == "000000000000000-Special-Chat" + assert name_000 == "Special Chat" + filename_001, name_001 = get_file_name("000000000000001", chat) + assert
filename_001 == "000000000000001-Special-Chat" + assert name_001 == "Special Chat" + + def test_unexpected_contact_format(self): + chat = ChatStore(Device.ANDROID, name="Some Chat") + with pytest.raises(ValueError, match="Unexpected contact format: invalid-contact"): + get_file_name("invalid-contact", chat) + + def test_contact_with_hyphen_and_chat_name(self): + chat = ChatStore(Device.ANDROID, name="Another Chat") + filename, name = get_file_name("123-456-7890@g.us", chat) + assert filename == "Another-Chat" + assert name == "Another Chat" + + def test_contact_with_hyphen_no_chat_name(self): + chat = ChatStore(Device.ANDROID, name=None) + filename, name = get_file_name("123-456-7890@g.us", chat) + assert filename == "123-456-7890" + assert name == "123-456-7890" + + +class TestGetCondForEmpty: + def test_enable_true(self): + condition = get_cond_for_empty(True, "c.jid", "c.broadcast") + assert condition == "AND (chat.hidden=0 OR c.jid='status@broadcast' OR c.broadcast>0)" + + def test_enable_false(self): + condition = get_cond_for_empty(False, "other_jid", "other_broadcast") + assert condition == "" + + +class TestGetChatCondition: + ... 
+
+
+class TestGetStatusLocation:  # get_status_location(): local offline copy of w3.css vs. the remote stylesheet URL
+    @patch('os.path.isdir')
+    @patch('os.path.isfile')
+    @patch('os.mkdir')
+    @patch('urllib.request.urlopen')
+    @patch('builtins.open', new_callable=mock_open)  # patches apply bottom-up: this mock is the first argument below
+    def test_offline_static_set(self, mock_open_file, mock_urlopen, mock_mkdir, mock_isfile, mock_isdir):
+        mock_isdir.return_value = False  # every os.path.isdir() call answers False
+        mock_isfile.return_value = False  # every os.path.isfile() call answers False
+        mock_response = MagicMock()
+        mock_response.read.return_value = b'W3.CSS Content'
+        mock_urlopen.return_value.__enter__.return_value = mock_response  # object bound by "with urlopen(...) as ..."
+        output_folder = "output_folder"
+        offline_static = "offline_static"
+
+        result = get_status_location(output_folder, offline_static)
+
+        assert result == os.path.join(offline_static, "w3.css")  # returned path carries no output_folder prefix
+        mock_mkdir.assert_called_once_with(os.path.join(output_folder, offline_static))
+        mock_urlopen.assert_called_once_with("https://www.w3schools.com/w3css/4/w3.css")
+        mock_open_file.assert_called_once_with(os.path.join(output_folder, offline_static, "w3.css"), "wb")
+        mock_open_file().write.assert_called_once_with(b'W3.CSS Content')  # downloaded bytes are written unchanged
+
+    def test_offline_static_not_set(self):  # empty offline_static: the remote stylesheet URL is returned
+        result = get_status_location("output_folder", "")
+        assert result == "https://www.w3schools.com/w3css/4/w3.css"
+
+
+class TestSafeName:  # table-driven checks of safe_name() over ASCII, punctuation, whitespace and non-Latin input
+    def generate_random_string(length=50):  # plain function (no self): only called while the class body is evaluated
+        random.seed(10)  # reseeding on every call makes equal-length calls return the same string
+        return ''.join(random.choice(string.ascii_letters + string.digits + "äöüß") for _ in range(length))
+
+    safe_name_test_cases = [  # (input_text, expected_output) pairs fed to test_safe_name
+        ("This is a test string", "This-is-a-test-string"),
+        ("This is a test string with special characters!@#$%^&*()",
+         "This-is-a-test-string-with-special-characters"),
+        ("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"),
+        ("This is a test string with mixed case ThisIsATestString",
+         "This-is-a-test-string-with-mixed-case-ThisIsATestString"),
+        ("This is a test string with extra spaces \u00A0 \u00A0 \u00A0 ThisIsATestString",
+         "This-is-a-test-string-with-extra-spaces-ThisIsATestString"),
+        ("This is a test string with unicode characters äöüß",
+         "This-is-a-test-string-with-unicode-characters-äöüß"),
+        ("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"),  # Chinese characters, should stay as is
+        (
+            f"This is a test string with long length {generate_random_string(1000)}",
+            f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}",
+        ),
+        ("", ""),  # Empty string
+        (" ", ""),  # String with only space
+        ("---", "---"),  # String with only hyphens
+        ("___", "___"),  # String with only underscores
+        ("a" * 100, "a" * 100),  # Long string with single character
+        ("a-b-c-d-e", "a-b-c-d-e"),  # String with hyphen
+        ("a_b_c_d_e", "a_b_c_d_e"),  # String with underscore
+        ("a b c d e", "a-b-c-d-e"),  # String with spaces
+        ("test.com/path/to/resource?param1=value1&param2=value2",
+         "test.compathtoresourceparam1value1param2value2"),  # Test with URL
+        ("filename.txt", "filename.txt"),  # Test with filename
+        ("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.",
+         "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."),  # Greek characters
+        ("This is a test with комбинированные знаки ̆ example",
+         "This-is-a-test-with-комбинированные-знаки-example")  # Mixed with unicode
+    ]
+
+    @pytest.mark.parametrize("input_text, expected_output", safe_name_test_cases)
+    def test_safe_name(self, input_text, expected_output):
+        result = safe_name(input_text)
+        assert result == expected_output
diff --git a/tests/test_vcards_contacts.py b/tests/test_vcards_contacts.py
new file mode 100644
index 0000000..2eac4d9
--- /dev/null
+++ b/tests/test_vcards_contacts.py
@@ -0,0 +1,48 @@
+# from contacts_names_from_vcards import readVCardsFile
+
+import os
+from Whatsapp_Chat_Exporter.vcards_contacts import normalize_number, read_vcards_file
+
+
+def test_readVCardsFile():  # reads data/contacts.vcf next to this file and checks the six contact names in order
+    data_dir = os.path.join(os.path.dirname(__file__), "data")
+    data = read_vcards_file(os.path.join(data_dir, "contacts.vcf"), "852")  # "852": presumably the default country code - confirm
+    if data:
+        print("Found Names")
+        print("-----------------------")
+        for count, contact_tuple in enumerate(data, start=1):
+            # The name is the second element of the tuple (at index 1)
+            name = contact_tuple[1]
+
+            # Print the count and the name
+            print(f"{count}. {name}")
+    print(data)  # debug aid only; the assertions below are the actual checks
+    assert len(data) == 6
+    # Test simple contact name
+    assert data[0][1] == "Sample Contact"
+    # Test complex name
+    assert data[1][1] == "Yard Lawn Guy, Jose Lopez"
+    # Test name with emoji
+    assert data[2][1] == "John Butler 🌟💫🌟"
+    # Test note with multi-line encoding
+    assert data[3][1] == "Airline Contact #'s"
+    # Test address with multi-line encoding
+    assert data[4][1] == "James Peacock Elementary"
+    # Test business entry using ORG but not F/FN
+    assert data[5][1] == "AAA Car Service"
+
+
+def test_create_number_to_name_dicts():
+    pass  # placeholder: no assertions yet, so this test always passes
+
+
+def test_fuzzy_match_numbers():
+    pass  # placeholder: no assertions yet, so this test always passes
+
+
+def test_normalize_number():  # each case: normalize_number(raw_number, country_code) == expected digits
+    assert normalize_number('0531234567', '1') == '1531234567'  # leading "0" replaced by the country code
+    assert normalize_number('001531234567', '2') == '1531234567'  # leading "00" dropped, country code not applied
+    assert normalize_number('+1531234567', '34') == '1531234567'  # leading "+" dropped, country code not applied
+    assert normalize_number('053(123)4567', '34') == '34531234567'  # parentheses removed, leading "0" replaced
+    assert normalize_number('0531-234-567', '58') == '58531234567'  # hyphens removed, leading "0" replaced