diff --git a/README.md b/README.md
index 04af342..04d0427 100644
--- a/README.md
+++ b/README.md
@@ -115,7 +115,7 @@
 Do an iPhone/iPad Backup with iTunes/Finder first. If you want to work on an encrypted iOS/iPadOS Backup, you should install iphone_backup_decrypt from [KnugiHK/iphone_backup_decrypt](https://github.com/KnugiHK/iphone_backup_decrypt) before you run the extract_iphone_media.py.
 ```sh
-pip install git+https://github.com/KnugiHK/iphone_backup_decrypt
+pip install whatsapp-chat-exporter["ios_backup"]
 ```
 > [!NOTE]
 > You will need to disable the built-in end-to-end encryption for WhatsApp backups. See [WhatsApp's FAQ](https://faq.whatsapp.com/490592613091019#turn-off-end-to-end-encrypted-backup) for how to do it.
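One caveat with the new extras syntax in the README hunk above: bash passes `whatsapp-chat-exporter["ios_backup"]` through unchanged when the bracket glob matches no file, but zsh aborts with `no matches found`. Quoting the whole requirement works in both shells:

```sh
pip install "whatsapp-chat-exporter[ios_backup]"
```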
diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py
index 07a341b..3f45365 100644
--- a/Whatsapp_Chat_Exporter/__main__.py
+++ b/Whatsapp_Chat_Exporter/__main__.py
@@ -11,14 +11,15 @@
 import logging
 import importlib.metadata
 from Whatsapp_Chat_Exporter import android_crypt, exported_handler, android_handler
 from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler
-from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore
-from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update
+from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore, Timing
+from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, Crypt
 from Whatsapp_Chat_Exporter.utility import readable_to_bytes, safe_name, bytes_to_readable
-from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, DbType
-from Whatsapp_Chat_Exporter.utility import telegram_json_format
+from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, check_update
+from Whatsapp_Chat_Exporter.utility import telegram_json_format, convert_time_unit, DbType
 from argparse import ArgumentParser, SUPPRESS
 from datetime import datetime
 from getpass import getpass
+from tqdm import tqdm
 from sys import exit
 from typing import Optional, List, Dict
 from Whatsapp_Chat_Exporter.vcards_contacts import ContactsFromVCards
@@ -286,13 +287,17 @@ def setup_argument_parser() -> ArgumentParser:
         help="Specify the chunk size for decrypting iOS backup, which may affect the decryption speed."
     )
     misc_group.add_argument(
-        "--max-bruteforce-worker", dest="max_bruteforce_worker", default=10, type=int,
+        "--max-bruteforce-worker", dest="max_bruteforce_worker", default=4, type=int,
         help="Specify the maximum number of worker for bruteforce decryption."
     )
     misc_group.add_argument(
         "--no-banner", dest="no_banner", default=False, action='store_true',
         help="Do not show the banner"
     )
+    misc_group.add_argument(
+        "--fix-dot-files", dest="fix_dot_files", default=False, action='store_true',
+        help="Fix files with a dot at the end of their name (allowing the outputs to be stored on FAT filesystems)"
+    )
     return parser
 
 
@@ -537,6 +542,7 @@ def process_messages(args, data: ChatCollection) -> None:
         exit(6)
 
     filter_chat = (args.filter_chat_include, args.filter_chat_exclude)
+    timing = Timing(args.timezone_offset if args.timezone_offset else CURRENT_TZ_OFFSET)
 
     with sqlite3.connect(msg_db) as db:
         db.row_factory = sqlite3.Row
@@ -548,14 +554,14 @@
             message_handler = ios_handler
 
         message_handler.messages(
-            db, data, args.media, args.timezone_offset, args.filter_date,
+            db, data, args.media, timing, args.filter_date,
             filter_chat, args.filter_empty, args.no_reply_ios
         )
 
         # Process media
         message_handler.media(
             db, data, args.media, args.filter_date,
-            filter_chat, args.filter_empty, args.separate_media
+            filter_chat, args.filter_empty, args.separate_media, args.fix_dot_files
         )
 
         # Process vcards
@@ -565,17 +571,17 @@
         )
 
         # Process calls
-        process_calls(args, db, data, filter_chat)
+        process_calls(args, db, data, filter_chat, timing)
 
 
-def process_calls(args, db, data: ChatCollection, filter_chat) -> None:
+def process_calls(args, db, data: ChatCollection, filter_chat, timing) -> None:
     """Process call history if available."""
     if args.android:
-        android_handler.calls(db, data, args.timezone_offset, filter_chat)
+        android_handler.calls(db, data, timing, filter_chat)
     elif args.ios and args.call_db_ios is not None:
         with sqlite3.connect(args.call_db_ios) as cdb:
             cdb.row_factory = sqlite3.Row
-            ios_handler.calls(cdb, data, args.timezone_offset, filter_chat)
+            ios_handler.calls(cdb, data, timing, filter_chat)
 
 
 def handle_media_directory(args) -> None:
@@ -665,24 +671,27 @@ def export_multiple_json(args, data: Dict) -> None:
 
     # Export each chat
     total = len(data.keys())
-    for index, jik in enumerate(data.keys()):
-        if data[jik]["name"] is not None:
-            contact = data[jik]["name"].replace('/', '')
-        else:
-            contact = jik.replace('+', '')
+    with tqdm(total=total, desc="Generating JSON files", unit="file", leave=False) as pbar:
+        for jik in data.keys():
+            if data[jik]["name"] is not None:
+                contact = data[jik]["name"].replace('/', '')
+            else:
+                contact = jik.replace('+', '')
 
-        if args.telegram:
-            messages = telegram_json_format(jik, data[jik], args.timezone_offset)
-        else:
-            messages = {jik: data[jik]}
-        with open(f"{json_path}/{safe_name(contact)}.json", "w") as f:
-            file_content = json.dumps(
-                messages,
-                ensure_ascii=not args.avoid_encoding_json,
-                indent=args.pretty_print_json
-            )
-            f.write(file_content)
-        logger.info(f"Writing JSON file...({index + 1}/{total})\r")
+            if args.telegram:
+                messages = telegram_json_format(jik, data[jik], args.timezone_offset)
+            else:
+                messages = {jik: data[jik]}
+            with open(f"{json_path}/{safe_name(contact)}.json", "w") as f:
+                file_content = json.dumps(
+                    messages,
+                    ensure_ascii=not args.avoid_encoding_json,
+                    indent=args.pretty_print_json
+                )
+                f.write(file_content)
+            pbar.update(1)
+        total_time = pbar.format_dict['elapsed']
+    logger.info(f"Generated {total} JSON files in {convert_time_unit(total_time)}{CLEAR_LINE}")
 
 
 def process_exported_chat(args, data: ChatCollection) -> None:
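The handlers now receive a single `Timing` object built once in `process_messages()` instead of a raw `timezone_offset` value. `Timing` lives in `data_model.py` and its body is not part of this diff; judging only from the call sites visible here (`Timing(offset)` at construction, `timing.format_timestamp(ts, fmt)` in `Message`), it presumably wraps a fixed UTC offset, roughly like this sketch (an assumed interface, not the project's actual code):

```python
from datetime import datetime, timedelta, timezone

class Timing:
    """Assumed shape of data_model.Timing: a fixed UTC offset plus a formatter."""

    def __init__(self, offset_hours: float):
        # e.g. Timing(8) would render timestamps as UTC+8 (assumption).
        self.tz = timezone(timedelta(hours=offset_hours))

    def format_timestamp(self, timestamp: float, fmt: str) -> str:
        # Convert a Unix timestamp into the configured timezone and format it.
        return datetime.fromtimestamp(timestamp, tz=self.tz).strftime(fmt)
```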
diff --git a/Whatsapp_Chat_Exporter/android_crypt.py b/Whatsapp_Chat_Exporter/android_crypt.py
index 3e921d1..54a4e09 100644
--- a/Whatsapp_Chat_Exporter/android_crypt.py
+++ b/Whatsapp_Chat_Exporter/android_crypt.py
@@ -1,13 +1,12 @@
-import time
 import hmac
 import io
 import logging
-import threading
 import zlib
 import concurrent.futures
+from tqdm import tqdm
 from typing import Tuple, Union
 from hashlib import sha256
-from sys import exit
+from functools import partial
 from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CRYPT14_OFFSETS, Crypt, DbType
 try:
@@ -112,13 +111,36 @@ def _decrypt_database(db_ciphertext: bytes, main_key: bytes, iv: bytes) -> bytes
         zlib.error: If decompression fails.
         ValueError: if the plaintext is not a SQLite database.
     """
+    FOOTER_SIZE = 32
+    if len(db_ciphertext) <= FOOTER_SIZE:
+        raise ValueError("Input data too short to contain a valid GCM tag.")
+
+    actual_ciphertext = db_ciphertext[:-FOOTER_SIZE]
+    tag = db_ciphertext[-FOOTER_SIZE: -FOOTER_SIZE + 16]
+
     cipher = AES.new(main_key, AES.MODE_GCM, iv)
-    db_compressed = cipher.decrypt(db_ciphertext)
-    db = zlib.decompress(db_compressed)
-    if db[0:6].upper() != b"SQLITE":
+    try:
+        db_compressed = cipher.decrypt_and_verify(actual_ciphertext, tag)
+    except ValueError:
+        # The key, IV, or tag could be wrong, but the key is the most likely culprit.
+        raise ValueError("Decryption/Authentication failed. Ensure you are using the correct key.")
+
+    if len(db_compressed) < 2 or db_compressed[0] != 0x78:
+        logger.debug(f"Data passes GCM but is not Zlib. Header: {db_compressed[:2].hex()}")
         raise ValueError(
-            "The plaintext is not a SQLite database. Ensure you are using the correct key."
+            "Key is correct, but decrypted data is not a valid compressed stream. "
+            "Is this even a valid WhatsApp database backup?"
         )
+
+    try:
+        db = zlib.decompress(db_compressed)
+    except zlib.error as e:
+        raise zlib.error(f"Decompression failed (the backup file is likely corrupted at source): {e}")
+
+    if not db.startswith(b"SQLite"):
+        raise ValueError(
+            "Data is valid and decompressed, but it is not a SQLite database. "
+            "Is this even a valid WhatsApp database backup?")
     return db
 
@@ -142,82 +164,69 @@
     # Attempt known offsets first
     for offsets in CRYPT14_OFFSETS:
-        iv = database[offsets["iv"]:offsets["iv"] + 16]
-        db_ciphertext = database[offsets["db"]:]
-        try:
-            decrypted_db = _decrypt_database(db_ciphertext, main_key, iv)
-        except (zlib.error, ValueError):
-            pass  # Try next offset
-        else:
-            logger.debug(
-                f"Decryption successful with known offsets: IV {offsets['iv']}, DB {offsets['db']}{CLEAR_LINE}"
-            )
-            return decrypted_db  # Successful decryption
+        iv = offsets["iv"]
+        db = offsets["db"]
+        # _attempt_decrypt_task swallows zlib.error/ValueError and returns None,
+        # so test the result instead of catching exceptions that never propagate.
+        decrypted_db = _attempt_decrypt_task((iv, iv + 16, db), database, main_key)
+        if decrypted_db is None:
+            continue  # Try next offset
+        logger.debug(
+            f"Decryption successful with known offsets: IV {iv}, DB {db}{CLEAR_LINE}"
+        )
+        return decrypted_db  # Successful decryption
-    def animate_message(stop_event):
-        base_msg = "Common offsets failed. Initiating brute-force with multithreading"
-        dots = ["", ".", "..", "..."]
-        i = 0
-        while not stop_event.is_set():
-            logger.info(f"{base_msg}{dots[i % len(dots)]}\x1b[K\r")
-            time.sleep(0.3)
-            i += 1
-        logger.info(f"Common offsets failed but brute-forcing the offset works!{CLEAR_LINE}")
-
-    stop_event = threading.Event()
-    anim_thread = threading.Thread(target=animate_message, args=(stop_event,))
-    anim_thread.start()
-
-    # Convert brute force generator into a list for parallel processing
-    offset_combinations = list(brute_force_offset())
-
-    def attempt_decrypt(offset_tuple):
-        """Attempt decryption with the given offsets."""
-        start_iv, end_iv, start_db = offset_tuple
-        iv = database[start_iv:end_iv]
-        db_ciphertext = database[start_db:]
-        logger.debug(""f"Trying offsets: IV {start_iv}-{end_iv}, DB {start_db}{CLEAR_LINE}")
-
-        try:
-            db = _decrypt_database(db_ciphertext, main_key, iv)
-        except (zlib.error, ValueError):
-            return None  # Decryption failed, move to next
-        else:
-            stop_event.set()
-            anim_thread.join()
-            logger.info(
-                f"The offsets of your IV and database are {start_iv} and "
-                f"{start_db}, respectively. To include your offsets in the "
-                "program, please report it by creating an issue on GitHub: "
-                "https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47"
-                f"\nShutting down other threads...{CLEAR_LINE}"
-            )
-            return db
-
-    with concurrent.futures.ThreadPoolExecutor(max_worker) as executor:
-        future_to_offset = {executor.submit(attempt_decrypt, offset)
-                            : offset for offset in offset_combinations}
-
-        try:
-            for future in concurrent.futures.as_completed(future_to_offset):
-                result = future.result()
-                if result is not None:
-                    # Shutdown remaining threads
+    logger.info(f"Common offsets failed. Will attempt to brute-force{CLEAR_LINE}")
+    offset_max = 200
+    workers = max_worker
+    check_offset = partial(_attempt_decrypt_task, database=database, main_key=main_key)
+    all_offsets = list(brute_force_offset(offset_max, offset_max))
+    executor = concurrent.futures.ProcessPoolExecutor(max_workers=workers)
+    try:
+        with tqdm(total=len(all_offsets), desc="Brute-forcing offsets", unit="trial", leave=False) as pbar:
+            results = executor.map(check_offset, all_offsets, chunksize=8)
+            found = False
+            for offset_info, result in zip(all_offsets, results):
+                pbar.update(1)
+                if result:
+                    start_iv, _, start_db = offset_info
+                    # Clean shutdown on success
                     executor.shutdown(wait=False, cancel_futures=True)
-                    return result
+                    found = True
+                    break
+            if found:
+                logger.info(
+                    f"The offsets of your IV and database are {start_iv} and {start_db}, respectively.{CLEAR_LINE}"
+                )
+                logger.info(
+                    f"To include your offsets in the exporter, please report it in the discussion thread on GitHub:{CLEAR_LINE}"
+                )
+                logger.info(f"https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47{CLEAR_LINE}")
+                return result
 
-        except KeyboardInterrupt:
-            stop_event.set()
-            anim_thread.join()
-            logger.info(f"Brute force interrupted by user (Ctrl+C). Shutting down gracefully...{CLEAR_LINE}")
-            executor.shutdown(wait=False, cancel_futures=True)
-            exit(1)
-        finally:
-            stop_event.set()
-            anim_thread.join()
+    except KeyboardInterrupt:
+        executor.shutdown(wait=False, cancel_futures=True)
+        print("\n")
+        raise KeyboardInterrupt(
+            f"Brute force interrupted by user (Ctrl+C). Shutting down gracefully...{CLEAR_LINE}"
+        )
+
+    finally:
+        executor.shutdown(wait=False)
 
     raise OffsetNotFoundError("Could not find the correct offsets for decryption.")
 
 
+def _attempt_decrypt_task(offset_tuple, database, main_key):
+    """Attempt decryption with the given offsets."""
+    start_iv, end_iv, start_db = offset_tuple
+    iv = database[start_iv:end_iv]
+    db_ciphertext = database[start_db:]
+
+    try:
+        return _decrypt_database(db_ciphertext, main_key, iv)
+    except (zlib.error, ValueError):
+        return None
+
+
 def _decrypt_crypt12(database: bytes, main_key: bytes) -> bytes:
     """Decrypt a crypt12 database.
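The rewritten brute force above drops the thread pool and animation thread in favour of a `ProcessPoolExecutor` driven through `executor.map`. Because `map` yields results lazily and in submission order, the consumer can stop at the first successful offset without draining the remaining trials. A minimal standalone sketch of the same pattern (the `check` function and numbers are made up for illustration):

```python
import concurrent.futures

def check(candidate: int) -> bool:
    # Stand-in for _attempt_decrypt_task: an expensive test that is
    # truthy only for the correct candidate.
    return candidate == 1234

if __name__ == "__main__":
    candidates = list(range(10_000))
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        # Results arrive lazily, in the same order as `candidates`.
        results = executor.map(check, candidates, chunksize=8)
        for candidate, ok in zip(candidates, results):
            if ok:
                print(f"hit at {candidate}")
                # Stop feeding the pool; pending work is cancelled.
                executor.shutdown(wait=False, cancel_futures=True)
                break
```

The `chunksize=8` mirrors the diff: with many small tasks, batching amortizes the inter-process pickling overhead.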
diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py
index 33bd589..49bf215 100644
--- a/Whatsapp_Chat_Exporter/android_handler.py
+++ b/Whatsapp_Chat_Exporter/android_handler.py
@@ -4,13 +4,14 @@
 import logging
 import sqlite3
 import os
 import shutil
+from tqdm import tqdm
 from pathlib import Path
 from mimetypes import MimeTypes
 from markupsafe import escape as htmle
 from base64 import b64decode, b64encode
 from datetime import datetime
-from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
-from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device
+from Whatsapp_Chat_Exporter.data_model import ChatStore, Message, Timing
+from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, MAX_SIZE, ROW_SIZE, JidType, Device
 from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty
 from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata
 from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable
@@ -47,12 +48,15 @@ def contacts(db, data, enrich_from_vcards):
     logger.info(f"Processed {total_row_number} contacts\n")
 
     c.execute("SELECT jid, COALESCE(display_name, wa_name) as display_name, status FROM wa_contacts;")
-    row = c.fetchone()
-    while row is not None:
-        current_chat = data.add_chat(row["jid"], ChatStore(Device.ANDROID, row["display_name"]))
-        if row["status"] is not None:
-            current_chat.status = row["status"]
-        row = c.fetchone()
+
+    with tqdm(total=total_row_number, desc="Processing contacts", unit="contact", leave=False) as pbar:
+        while (row := _fetch_row_safely(c)) is not None:
+            current_chat = data.add_chat(row["jid"], ChatStore(Device.ANDROID, row["display_name"]))
+            if row["status"] is not None:
+                current_chat.status = row["status"]
+            pbar.update(1)
+        total_time = pbar.format_dict['elapsed']
+    logger.info(f"Processed {total_row_number} contacts in {convert_time_unit(total_time)}{CLEAR_LINE}")
     return True
 
@@ -72,7 +76,6 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat,
     """
     c = db.cursor()
     total_row_number = _get_message_count(c, filter_empty, filter_date, filter_chat)
-    logger.info(f"Processing messages...(0/{total_row_number})\r")
 
     try:
         content_cursor = _get_messages_cursor_legacy(c, filter_empty, filter_date, filter_chat)
@@ -84,23 +87,12 @@
     except Exception as e:
         raise e
 
-    i = 0
-    # Fetch the first row safely
-    content = _fetch_row_safely(content_cursor)
-
-    while content is not None:
-        _process_single_message(data, content, table_message, timezone_offset)
-
-        i += 1
-        if i % 1000 == 0:
-            logger.info(f"Processing messages...({i}/{total_row_number})\r")
-
-        # Fetch the next row safely
-        content = _fetch_row_safely(content_cursor)
-
-    _get_reactions(db,
data) - logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") - + with tqdm(total=total_row_number, desc="Processing messages", unit="msg", leave=False) as pbar: + while (content := _fetch_row_safely(content_cursor)) is not None: + _process_single_message(data, content, table_message, timezone_offset) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Processed {total_row_number} messages in {convert_time_unit(total_time)}{CLEAR_LINE}") # Helper functions for message processing @@ -126,14 +118,16 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat): {include_filter} {exclude_filter}""") except sqlite3.OperationalError: - empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") + empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND timestamp {filter_date}' if filter_date is not None else '' include_filter = get_chat_condition( - filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android") exclude_filter = get_chat_condition( - filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android") - cursor.execute(f"""SELECT count() + cursor.execute(f"""SELECT count(), + COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid, + COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid FROM message LEFT JOIN chat ON chat._id = message.chat_row_id @@ -141,6 +135,14 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat): ON jid._id = chat.jid_row_id LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id + LEFT JOIN jid_map as jid_map_group + ON message.sender_jid_row_id = jid_map_group.lid_row_id + LEFT JOIN jid lid_group + ON jid_map_group.jid_row_id = lid_group._id WHERE 1=1 {empty_filter} {date_filter} @@ -219,11 +221,11 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' include_filter = get_chat_condition( - filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + filter_chat[0], True, ["key_remote_jid", "lid_group.raw_string"], "jid_global", "android") exclude_filter = get_chat_condition( - filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android") + filter_chat[1], False, ["key_remote_jid", "lid_group.raw_string"], "jid_global", "android") - cursor.execute(f"""SELECT jid_global.raw_string as key_remote_jid, + cursor.execute(f"""SELECT COALESCE(lid_global.raw_string, jid_global.raw_string) as key_remote_jid, message._id, message.from_me as key_from_me, message.timestamp, @@ -238,7 +240,7 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): message.key_id, message_quoted.text_data as quoted_data, message.message_type as media_wa_type, - jid_group.raw_string as group_sender_jid, + COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid, chat.subject as chat_subject, missed_call_logs.video_call, message.sender_jid_row_id, @@ -248,7 +250,8 @@ def 
_get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): jid_new.raw_string as new_jid, jid_global.type as jid_type, COALESCE(receipt_user.receipt_timestamp, message.received_timestamp) as received_timestamp, - COALESCE(receipt_user.read_timestamp, receipt_user.played_timestamp) as read_timestamp + COALESCE(receipt_user.read_timestamp, receipt_user.played_timestamp) as read_timestamp, + message_media.raw_transcription_text as transcription_text FROM message LEFT JOIN message_quoted ON message_quoted.message_row_id = message._id @@ -280,6 +283,14 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat): ON jid_new._id = message_system_number_change.new_jid_row_id LEFT JOIN receipt_user ON receipt_user.message_row_id = message._id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id + LEFT JOIN jid_map as jid_map_group + ON message.sender_jid_row_id = jid_map_group.lid_row_id + LEFT JOIN jid lid_group + ON jid_map_group.jid_row_id = lid_group._id WHERE key_remote_jid <> '-1' {empty_filter} {date_filter} @@ -321,7 +332,7 @@ def _process_single_message(data, content, table_message, timezone_offset): timestamp=content["timestamp"], time=content["timestamp"], key_id=content["key_id"], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, + timezone_offset=timezone_offset, message_type=content["media_wa_type"], received_timestamp=content["received_timestamp"], read_timestamp=content["read_timestamp"] @@ -353,9 +364,12 @@ def _process_single_message(data, content, table_message, timezone_offset): if not table_message and content["media_caption"] is not None: # Old schema message.caption = content["media_caption"] - elif table_message and content["media_wa_type"] == 1 and content["data"] is not None: + elif table_message: # New schema - message.caption = content["data"] + if content["media_wa_type"] == 1 and content["data"] is not None: + message.caption = content["data"] + elif content["media_wa_type"] == 2 and content["transcription_text"] is not None: + message.caption = f'"{content["transcription_text"]}"' else: message.caption = None @@ -547,7 +561,7 @@ def _get_reactions(db, data): logger.info(f"Processed reactions{CLEAR_LINE}") -def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True): +def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True, fix_dot_files=False): """ Process WhatsApp media files from the database. 
@@ -562,8 +576,6 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa """ c = db.cursor() total_row_number = _get_media_count(c, filter_empty, filter_date, filter_chat) - logger.info(f"Processing media...(0/{total_row_number})\r") - try: content_cursor = _get_media_cursor_legacy(c, filter_empty, filter_date, filter_chat) except sqlite3.OperationalError: @@ -575,18 +587,12 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa # Ensure thumbnails directory exists Path(f"{media_folder}/thumbnails").mkdir(parents=True, exist_ok=True) - i = 0 - while content is not None: - _process_single_media(data, content, media_folder, mime, separate_media) - - i += 1 - if i % 100 == 0: - logger.info(f"Processing media...({i}/{total_row_number})\r") - - content = content_cursor.fetchone() - - logger.info(f"Processed {total_row_number} media{CLEAR_LINE}") - + with tqdm(total=total_row_number, desc="Processing media", unit="media", leave=False) as pbar: + while (content := _fetch_row_safely(content_cursor)) is not None: + _process_single_media(data, content, media_folder, mime, separate_media, fix_dot_files) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Processed {total_row_number} media in {convert_time_unit(total_time)}{CLEAR_LINE}") # Helper functions for media processing @@ -617,11 +623,13 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat): empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' include_filter = get_chat_condition( - filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android") exclude_filter = get_chat_condition( - filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android") + filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android") - cursor.execute(f"""SELECT count() + cursor.execute(f"""SELECT count(), + COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid, + COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid FROM message_media INNER JOIN message ON message_media.message_row_id = message._id @@ -631,6 +639,14 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat): ON jid._id = chat.jid_row_id LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id + LEFT JOIN jid_map as jid_map_group + ON message.sender_jid_row_id = jid_map_group.lid_row_id + LEFT JOIN jid lid_group + ON jid_map_group.jid_row_id = lid_group._id WHERE 1=1 {empty_filter} {date_filter} @@ -679,18 +695,19 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat): empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast") date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else '' include_filter = get_chat_condition( - filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android") exclude_filter = get_chat_condition( - filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android") + filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], 
"jid", "android") - cursor.execute(f"""SELECT jid.raw_string as key_remote_jid, + cursor.execute(f"""SELECT COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid, message_row_id, file_path, message_url, mime_type, media_key, file_hash, - thumbnail + thumbnail, + COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid FROM message_media INNER JOIN message ON message_media.message_row_id = message._id @@ -702,6 +719,14 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat): ON message_media.file_hash = media_hash_thumbnail.media_hash LEFT JOIN jid jid_group ON jid_group._id = message.sender_jid_row_id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id + LEFT JOIN jid_map as jid_map_group + ON message.sender_jid_row_id = jid_map_group.lid_row_id + LEFT JOIN jid lid_group + ON jid_map_group.jid_row_id = lid_group._id WHERE jid.type <> 7 {empty_filter} {date_filter} @@ -711,7 +736,7 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat): return cursor -def _process_single_media(data, content, media_folder, mime, separate_media): +def _process_single_media(data, content, media_folder, mime, separate_media, fix_dot_files=False): """Process a single media file.""" file_path = f"{media_folder}/{content['file_path']}" current_chat = data.get_chat(content["key_remote_jid"]) @@ -719,8 +744,6 @@ def _process_single_media(data, content, media_folder, mime, separate_media): message.media = True if os.path.isfile(file_path): - message.data = file_path - # Set mime type if content["mime_type"] is None: guess = mime.guess_type(file_path)[0] @@ -730,6 +753,16 @@ def _process_single_media(data, content, media_folder, mime, separate_media): message.mime = "application/octet-stream" else: message.mime = content["mime_type"] + + if fix_dot_files and file_path.endswith("."): + extension = mime.guess_extension(message.mime) + if message.mime == "application/octet-stream" or not extension: + new_file_path = file_path[:-1] + else: + extension = mime.guess_extension(message.mime) + new_file_path = file_path[:-1] + extension + os.rename(file_path, new_file_path) + file_path = new_file_path # Copy media to separate folder if needed if separate_media: @@ -741,6 +774,8 @@ def _process_single_media(data, content, media_folder, mime, separate_media): new_path = os.path.join(new_folder, current_filename) shutil.copy2(file_path, new_path) message.data = new_path + else: + message.data = file_path else: message.data = "The media is missing" message.mime = "media" @@ -764,45 +799,56 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty): rows = _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty) total_row_number = len(rows) - logger.info(f"Processing vCards...(0/{total_row_number})\r") # Create vCards directory if it doesn't exist path = os.path.join(media_folder, "vCards") Path(path).mkdir(parents=True, exist_ok=True) - for index, row in enumerate(rows): - _process_vcard_row(row, path, data) - logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r") - logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}") - + with tqdm(total=total_row_number, desc="Processing vCards", unit="vcard", leave=False) as pbar: + for row in rows: + _process_vcard_row(row, path, data) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Processed {total_row_number} vCards in 
{convert_time_unit(total_time)}{CLEAR_LINE}") def _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty): """Execute vCard query for modern WhatsApp database schema.""" # Build the filter conditions - chat_filter_include = get_chat_condition( - filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android") - chat_filter_exclude = get_chat_condition( - filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android") date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else '' empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push") + include_filter = get_chat_condition( + filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android") + exclude_filter = get_chat_condition( + filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android") query = f"""SELECT message_row_id, - messages.key_remote_jid, - vcard, - messages.media_name - FROM messages_vcards - INNER JOIN messages - ON messages_vcards.message_row_id = messages._id - INNER JOIN jid - ON messages.key_remote_jid = jid.raw_string - LEFT JOIN chat - ON chat.jid_row_id = jid._id + COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid, + vcard, + messages.media_name, + COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid + FROM messages_vcards + INNER JOIN messages + ON messages_vcards.message_row_id = messages._id + INNER JOIN jid + ON messages.key_remote_jid = jid.raw_string + LEFT JOIN chat + ON chat.jid_row_id = jid._id + LEFT JOIN jid jid_group + ON jid_group._id = message.sender_jid_row_id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id + LEFT JOIN jid_map as jid_map_group + ON message.sender_jid_row_id = jid_map_group.lid_row_id + LEFT JOIN jid lid_group + ON jid_map_group.jid_row_id = lid_group._id WHERE 1=1 {empty_filter} {date_filter} - {chat_filter_include} - {chat_filter_exclude} + {include_filter} + {exclude_filter} ORDER BY messages.key_remote_jid ASC;""" c.execute(query) return c.fetchall() @@ -879,32 +925,37 @@ def calls(db, data, timezone_offset, filter_chat): chat = ChatStore(Device.ANDROID, "WhatsApp Calls") # Process each call - content = calls_data.fetchone() - while content is not None: - _process_call_record(content, chat, data, timezone_offset) - content = calls_data.fetchone() + with tqdm(total=total_row_number, desc="Processing calls", unit="call", leave=False) as pbar: + while (content := _fetch_row_safely(calls_data)) is not None: + _process_call_record(content, chat, data, timezone_offset) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] # Add the calls chat to the data data.add_chat("000000000000000", chat) - logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}") - + logger.info(f"Processed {total_row_number} calls in {convert_time_unit(total_time)}{CLEAR_LINE}") def _get_calls_count(c, filter_chat): """Get the count of call records that match the filter.""" # Build the filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["jid.raw_string"]) - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["jid.raw_string"]) + include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid"]) + exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid"]) - query = f"""SELECT count() + query = f"""SELECT count(), + 
COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid FROM call_log INNER JOIN jid ON call_log.jid_row_id = jid._id LEFT JOIN chat ON call_log.jid_row_id = chat.jid_row_id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id WHERE 1=1 - {chat_filter_include} - {chat_filter_exclude}""" + {include_filter} + {exclude_filter}""" c.execute(query) return c.fetchone()[0] @@ -913,11 +964,11 @@ def _fetch_calls_data(c, filter_chat): """Fetch call data from the database.""" # Build the filter conditions - chat_filter_include = get_chat_condition(filter_chat[0], True, ["jid.raw_string"]) - chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["jid.raw_string"]) + include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid"]) + exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid"]) query = f"""SELECT call_log._id, - jid.raw_string, + COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid, from_me, call_id, timestamp, @@ -931,9 +982,13 @@ def _fetch_calls_data(c, filter_chat): ON call_log.jid_row_id = jid._id LEFT JOIN chat ON call_log.jid_row_id = chat.jid_row_id + LEFT JOIN jid_map as jid_map_global + ON chat.jid_row_id = jid_map_global.lid_row_id + LEFT JOIN jid lid_global + ON jid_map_global.jid_row_id = lid_global._id WHERE 1=1 - {chat_filter_include} - {chat_filter_exclude}""" + {include_filter} + {exclude_filter}""" c.execute(query) return c @@ -945,13 +1000,13 @@ def _process_call_record(content, chat, data, timezone_offset): timestamp=content["timestamp"], time=content["timestamp"], key_id=content["call_id"], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, + timezone_offset=timezone_offset, received_timestamp=None, # TODO: Add timestamp read_timestamp=None # TODO: Add timestamp ) # Get caller/callee name - _jid = content["raw_string"] + _jid = content["key_remote_jid"] name = data.get_chat(_jid).name if _jid in data else content["chat_subject"] or None if _jid is not None and "@" in _jid: fallback = _jid.split('@')[0] @@ -996,6 +1051,7 @@ def _construct_call_description(content, call): return description +# TODO: Marked for enhancement on multi-threaded processing def create_html( data, output_folder, @@ -1011,7 +1067,6 @@ def create_html( template = setup_template(template, no_avatar, experimental) total_row_number = len(data) - logger.info(f"Generating chats...(0/{total_row_number})\r") # Create output directory if it doesn't exist if not os.path.isdir(output_folder): @@ -1019,43 +1074,42 @@ def create_html( w3css = get_status_location(output_folder, offline_static) - for current, contact in enumerate(data): - current_chat = data.get_chat(contact) - if len(current_chat) == 0: - # Skip empty chats - continue + with tqdm(total=total_row_number, desc="Generating HTML", unit="file", leave=False) as pbar: + for contact in data: + current_chat = data.get_chat(contact) + if len(current_chat) == 0: + # Skip empty chats + continue - safe_file_name, name = get_file_name(contact, current_chat) + safe_file_name, name = get_file_name(contact, current_chat) - if maximum_size is not None: - _generate_paginated_chat( - current_chat, - safe_file_name, - name, - contact, - output_folder, - template, - w3css, - maximum_size, - headline - ) - else: - _generate_single_chat( - current_chat, - safe_file_name, - name, - contact, - output_folder, - template, - w3css, - headline - ) - - if current % 10 == 0: - 
logger.info(f"Generating chats...({current}/{total_row_number})\r") - - logger.info(f"Generated {total_row_number} chats{CLEAR_LINE}") + if maximum_size is not None: + _generate_paginated_chat( + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, + maximum_size, + headline + ) + else: + _generate_single_chat( + current_chat, + safe_file_name, + name, + contact, + output_folder, + template, + w3css, + headline + ) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Generated {total_row_number} chats in {convert_time_unit(total_time)}{CLEAR_LINE}") def _generate_single_chat(current_chat, safe_file_name, name, contact, output_folder, template, w3css, headline): """Generate a single HTML file for a chat.""" diff --git a/Whatsapp_Chat_Exporter/data_model.py b/Whatsapp_Chat_Exporter/data_model.py index 47034e6..9ea0d7b 100644 --- a/Whatsapp_Chat_Exporter/data_model.py +++ b/Whatsapp_Chat_Exporter/data_model.py @@ -279,7 +279,7 @@ class Message: key_id: Union[int, str], received_timestamp: int = None, read_timestamp: int = None, - timezone_offset: int = 0, + timezone_offset: Optional[Timing] = Timing(0), message_type: Optional[int] = None ) -> None: """ @@ -300,10 +300,9 @@ class Message: """ self.from_me = bool(from_me) self.timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp - timing = Timing(timezone_offset) if isinstance(time, (int, float)): - self.time = timing.format_timestamp(self.timestamp, "%H:%M") + self.time = timezone_offset.format_timestamp(self.timestamp, "%H:%M") elif isinstance(time, str): self.time = time else: @@ -318,14 +317,14 @@ class Message: self.mime = None self.message_type = message_type if isinstance(received_timestamp, (int, float)): - self.received_timestamp = timing.format_timestamp( + self.received_timestamp = timezone_offset.format_timestamp( received_timestamp, "%Y/%m/%d %H:%M") elif isinstance(received_timestamp, str): self.received_timestamp = received_timestamp else: self.received_timestamp = None if isinstance(read_timestamp, (int, float)): - self.read_timestamp = timing.format_timestamp( + self.read_timestamp = timezone_offset.format_timestamp( read_timestamp, "%Y/%m/%d %H:%M") elif isinstance(read_timestamp, str): self.read_timestamp = read_timestamp diff --git a/Whatsapp_Chat_Exporter/exported_handler.py b/Whatsapp_Chat_Exporter/exported_handler.py index 9e53c23..984dbe1 100644 --- a/Whatsapp_Chat_Exporter/exported_handler.py +++ b/Whatsapp_Chat_Exporter/exported_handler.py @@ -4,8 +4,9 @@ import os import logging from datetime import datetime from mimetypes import MimeTypes +from tqdm import tqdm from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device, convert_time_unit logger = logging.getLogger(__name__) @@ -34,17 +35,16 @@ def messages(path, data, assume_first_as_me=False): # Second pass: process the messages with open(path, "r", encoding="utf8") as file: - for index, line in enumerate(file): - you, user_identification_done = process_line( - line, index, chat, path, you, - assume_first_as_me, user_identification_done - ) + with tqdm(total=total_row_number, desc="Processing messages & media", unit="msg&media", leave=False) as pbar: + for index, line in enumerate(file): + you, user_identification_done = process_line( + line, index, chat, path, you, + assume_first_as_me, user_identification_done + ) + pbar.update(1) + total_time = 
pbar.format_dict['elapsed'] + logger.info(f"Processed {total_row_number} messages & media in {convert_time_unit(total_time)}{CLEAR_LINE}") - # Show progress - if index % 1000 == 0: - logger.info(f"Processing messages & media...({index}/{total_row_number})\r") - - logger.info(f"Processed {total_row_number} messages & media{CLEAR_LINE}") return data diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 5a3230e..c0f86fe 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -4,12 +4,13 @@ import os import logging import shutil from glob import glob +from tqdm import tqdm from pathlib import Path from mimetypes import MimeTypes from markupsafe import escape as htmle from Whatsapp_Chat_Exporter.data_model import ChatStore, Message -from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition -from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name, Device +from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, get_chat_condition, Device +from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name logger = logging.getLogger(__name__) @@ -23,17 +24,18 @@ def contacts(db, data): logger.info(f"Pre-processing contacts...({total_row_number})\r") c.execute("""SELECT ZWHATSAPPID, ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""") - content = c.fetchone() - while content is not None: - zwhatsapp_id = content["ZWHATSAPPID"] - if not zwhatsapp_id.endswith("@s.whatsapp.net"): - zwhatsapp_id += "@s.whatsapp.net" + with tqdm(total=total_row_number, desc="Processing contacts", unit="contact", leave=False) as pbar: + while (content := c.fetchone()) is not None: + zwhatsapp_id = content["ZWHATSAPPID"] + if not zwhatsapp_id.endswith("@s.whatsapp.net"): + zwhatsapp_id += "@s.whatsapp.net" - current_chat = ChatStore(Device.IOS) - current_chat.status = content["ZABOUTTEXT"] - data.add_chat(zwhatsapp_id, current_chat) - content = c.fetchone() - logger.info(f"Pre-processed {total_row_number} contacts{CLEAR_LINE}") + current_chat = ChatStore(Device.IOS) + current_chat.status = content["ZABOUTTEXT"] + data.add_chat(zwhatsapp_id, current_chat) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Pre-processed {total_row_number} contacts in {convert_time_unit(total_time)}{CLEAR_LINE}") def process_contact_avatars(current_chat, media_folder, contact_id): @@ -92,7 +94,6 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, """ c.execute(contact_query) total_row_number = c.fetchone()[0] - logger.info(f"Processing contacts...({total_row_number})\r") # Get distinct contacts contacts_query = f""" @@ -114,24 +115,24 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, c.execute(contacts_query) # Process each contact - content = c.fetchone() - while content is not None: - contact_name = get_contact_name(content) - contact_id = content["ZCONTACTJID"] + with tqdm(total=total_row_number, desc="Processing contacts", unit="contact", leave=False) as pbar: + while (content := c.fetchone()) is not None: + contact_name = get_contact_name(content) + contact_id = content["ZCONTACTJID"] - # Add or update chat - if contact_id not in data: - current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder)) - else: - current_chat = data.get_chat(contact_id) - current_chat.name = contact_name - 
current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg") + # Add or update chat + if contact_id not in data: + current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder)) + else: + current_chat = data.get_chat(contact_id) + current_chat.name = contact_name + current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg") - # Process avatar images - process_contact_avatars(current_chat, media_folder, contact_id) - content = c.fetchone() - - logger.info(f"Processed {total_row_number} contacts{CLEAR_LINE}") + # Process avatar images + process_contact_avatars(current_chat, media_folder, contact_id) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Processed {total_row_number} contacts in {convert_time_unit(total_time)}{CLEAR_LINE}") # Get message count message_count_query = f""" @@ -190,46 +191,42 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat, message_map = {row[0][:17]: row[1] or row[2] for row in cursor2.fetchall() if row[0]} # Process each message - i = 0 - content = c.fetchone() - while content is not None: - contact_id = content["ZCONTACTJID"] - message_pk = content["Z_PK"] - is_group_message = content["ZGROUPINFO"] is not None + with tqdm(total=total_row_number, desc="Processing messages", unit="msg", leave=False) as pbar: + while (content := c.fetchone()) is not None: + contact_id = content["ZCONTACTJID"] + message_pk = content["Z_PK"] + is_group_message = content["ZGROUPINFO"] is not None - # Ensure chat exists - if contact_id not in data: - current_chat = data.add_chat(contact_id, ChatStore(Device.IOS)) - process_contact_avatars(current_chat, media_folder, contact_id) - else: - current_chat = data.get_chat(contact_id) + # Ensure chat exists + if contact_id not in data: + current_chat = data.add_chat(contact_id, ChatStore(Device.IOS)) + process_contact_avatars(current_chat, media_folder, contact_id) + else: + current_chat = data.get_chat(contact_id) - # Create message object - ts = APPLE_TIME + content["ZMESSAGEDATE"] - message = Message( - from_me=content["ZISFROMME"], - timestamp=ts, - time=ts, - key_id=content["ZSTANZAID"][:17], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET, - message_type=content["ZMESSAGETYPE"], - received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None, - read_timestamp=None # TODO: Add timestamp - ) + # Create message object + ts = APPLE_TIME + content["ZMESSAGEDATE"] + message = Message( + from_me=content["ZISFROMME"], + timestamp=ts, + time=ts, + key_id=content["ZSTANZAID"][:17], + timezone_offset=timezone_offset, + message_type=content["ZMESSAGETYPE"], + received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None, + read_timestamp=None # TODO: Add timestamp + ) - # Process message data - invalid = process_message_data(message, content, is_group_message, data, message_map, no_reply) + # Process message data + invalid = process_message_data(message, content, is_group_message, data, message_map, no_reply) - # Add valid messages to chat - if not invalid: - current_chat.add_message(message_pk, message) + # Add valid messages to chat + if not invalid: + current_chat.add_message(message_pk, message) - # Update progress - i += 1 - if i % 1000 == 0: - logger.info(f"Processing messages...({i}/{total_row_number})\r") - content = c.fetchone() - logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}") + pbar.update(1) + total_time = 
pbar.format_dict['elapsed']
+    logger.info(f"Processed {total_row_number} messages in {convert_time_unit(total_time)}{CLEAR_LINE}")
 
 
 def process_message_data(message, content, is_group_message, data, message_map, no_reply):
@@ -315,7 +312,7 @@ def process_message_text(message, content):
         message.data = msg
 
 
-def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False):
+def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False, fix_dot_files=False):
     """Process media files from WhatsApp messages."""
     c = db.cursor()
 
@@ -371,20 +368,15 @@
     # Process each media item
     mime = MimeTypes()
-    i = 0
-    content = c.fetchone()
-    while content is not None:
-        process_media_item(content, data, media_folder, mime, separate_media)
-
-        # Update progress
-        i += 1
-        if i % 100 == 0:
-            logger.info(f"Processing media...({i}/{total_row_number})\r")
-        content = c.fetchone()
-    logger.info(f"Processed {total_row_number} media{CLEAR_LINE}")
+    with tqdm(total=total_row_number, desc="Processing media", unit="media", leave=False) as pbar:
+        while (content := c.fetchone()) is not None:
+            process_media_item(content, data, media_folder, mime, separate_media, fix_dot_files)
+            pbar.update(1)
+        total_time = pbar.format_dict['elapsed']
+    logger.info(f"Processed {total_row_number} media in {convert_time_unit(total_time)}{CLEAR_LINE}")
 
 
-def process_media_item(content, data, media_folder, mime, separate_media):
+def process_media_item(content, data, media_folder, mime, separate_media, fix_dot_files=False):
     """Process a single media item."""
     file_path = f"{media_folder}/Message/{content['ZMEDIALOCALPATH']}"
     current_chat = data.get_chat(content["ZCONTACTJID"])
@@ -395,14 +387,22 @@
         current_chat.media_base = media_folder + "/"
 
     if os.path.isfile(file_path):
-        message.data = '/'.join(file_path.split("/")[1:])
-
         # Set MIME type
         if content["ZVCARDSTRING"] is None:
             guess = mime.guess_type(file_path)[0]
             message.mime = guess if guess is not None else "application/octet-stream"
         else:
             message.mime = content["ZVCARDSTRING"]
+
+        if fix_dot_files and file_path.endswith("."):
+            extension = mime.guess_extension(message.mime)
+            if message.mime == "application/octet-stream" or not extension:
+                new_file_path = file_path[:-1]
+            else:
+                new_file_path = file_path[:-1] + extension
+            os.rename(file_path, new_file_path)
+            file_path = new_file_path
 
         # Handle separate media option
         if separate_media:
@@ -413,7 +413,9 @@
             Path(new_folder).mkdir(parents=True, exist_ok=True)
             new_path = os.path.join(new_folder, current_filename)
             shutil.copy2(file_path, new_path)
-            message.data = '/'.join(new_path.split("\\")[1:])
+            message.data = '/'.join(new_path.split("/")[1:])
+        else:
+            message.data = '/'.join(file_path.split("/")[1:])
     else:
         # Handle missing media
         message.data = "The media is missing"
@@ -467,10 +469,12 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty):
     Path(path).mkdir(parents=True, exist_ok=True)
 
     # Process each vCard
-    for index, content in enumerate(contents):
-        process_vcard_item(content, path, data)
-        logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r")
-    logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}")
+    with tqdm(total=total_row_number, desc="Processing
vCards", unit="vcard", leave=False) as pbar: + for content in contents: + process_vcard_item(content, path, data) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Processed {total_row_number} vCards in {convert_time_unit(total_time)}{CLEAR_LINE}") def process_vcard_item(content, path, data): @@ -530,8 +534,6 @@ def calls(db, data, timezone_offset, filter_chat): if total_row_number == 0: return - logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}\n") - # Fetch call records calls_query = f""" SELECT ZCALLIDSTRING, @@ -556,14 +558,15 @@ def calls(db, data, timezone_offset, filter_chat): # Create calls chat chat = ChatStore(Device.ANDROID, "WhatsApp Calls") - # Process each call - content = c.fetchone() - while content is not None: - process_call_record(content, chat, data, timezone_offset) - content = c.fetchone() + with tqdm(total=total_row_number, desc="Processing calls", unit="call", leave=False) as pbar: + while (content := c.fetchone()) is not None: + process_call_record(content, chat, data, timezone_offset) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] # Add calls chat to data data.add_chat("000000000000000", chat) + logger.info(f"Processed {total_row_number} calls in {convert_time_unit(total_time)}{CLEAR_LINE}") def process_call_record(content, chat, data, timezone_offset): @@ -574,7 +577,7 @@ def process_call_record(content, chat, data, timezone_offset): timestamp=ts, time=ts, key_id=content["ZCALLIDSTRING"], - timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET + timezone_offset=timezone_offset ) # Set sender info diff --git a/Whatsapp_Chat_Exporter/ios_media_handler.py b/Whatsapp_Chat_Exporter/ios_media_handler.py index 4416727..56df3d0 100644 --- a/Whatsapp_Chat_Exporter/ios_media_handler.py +++ b/Whatsapp_Chat_Exporter/ios_media_handler.py @@ -6,7 +6,9 @@ import sqlite3 import os import getpass from sys import exit, platform as osname -from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier +import sys +from tqdm import tqdm +from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier, convert_time_unit from Whatsapp_Chat_Exporter.bplist import BPListReader try: from iphone_backup_decrypt import EncryptedBackup, RelativePath @@ -79,6 +81,8 @@ class BackupExtractor: logger.info(f"Encryption detected on the backup!{CLEAR_LINE}") password = getpass.getpass("Enter the password for the backup:") + sys.stdout.write("\033[F\033[K") + sys.stdout.flush() self._decrypt_backup(password) self._extract_decrypted_files() @@ -89,7 +93,7 @@ class BackupExtractor: Args: password (str): The password for the encrypted backup. 
""" - logger.info(f"Trying to decrypt the iOS backup...{CLEAR_LINE}") + logger.info(f"Trying to open the iOS backup...{CLEAR_LINE}") self.backup = EncryptedBackup( backup_directory=self.base_dir, passphrase=password, @@ -97,7 +101,7 @@ class BackupExtractor: check_same_thread=False, decrypt_chunk_size=self.decrypt_chunk_size, ) - logger.info(f"iOS backup decrypted successfully{CLEAR_LINE}") + logger.info(f"iOS backup is opened successfully{CLEAR_LINE}") logger.info("Decrypting WhatsApp database...\r") try: self.backup.extract_file( @@ -130,9 +134,12 @@ class BackupExtractor: def _extract_decrypted_files(self): """Extract all WhatsApp files after decryption""" + pbar = tqdm(desc="Decrypting and extracting files", unit="file", leave=False) def extract_progress_handler(file_id, domain, relative_path, n, total_files): - if n % 100 == 0: - logger.info(f"Decrypting and extracting files...({n}/{total_files})\r") + if pbar.total is None: + pbar.total = total_files + pbar.n = n + pbar.refresh() return True self.backup.extract_files( @@ -141,7 +148,9 @@ class BackupExtractor: preserve_folders=True, filter_callback=extract_progress_handler ) - logger.info(f"All required files are decrypted and extracted.{CLEAR_LINE}") + total_time = pbar.format_dict['elapsed'] + pbar.close() + logger.info(f"All required files are decrypted and extracted in {convert_time_unit(total_time)}{CLEAR_LINE}") def _extract_unencrypted_backup(self): """ @@ -192,7 +201,6 @@ class BackupExtractor: c = manifest.cursor() c.execute(f"SELECT count() FROM Files WHERE domain = '{_wts_id}'") total_row_number = c.fetchone()[0] - logger.info(f"Extracting WhatsApp files...(0/{total_row_number})\r") c.execute( f""" SELECT fileID, relativePath, flags, file AS metadata, @@ -205,33 +213,30 @@ class BackupExtractor: if not os.path.isdir(_wts_id): os.mkdir(_wts_id) - row = c.fetchone() - while row is not None: - if not row["relativePath"]: # Skip empty relative paths - row = c.fetchone() - continue + with tqdm(total=total_row_number, desc="Extracting WhatsApp files", unit="file", leave=False) as pbar: + while (row := c.fetchone()) is not None: + if not row["relativePath"]: # Skip empty relative paths + continue - destination = os.path.join(_wts_id, row["relativePath"]) - hashes = row["fileID"] - folder = hashes[:2] - flags = row["flags"] + destination = os.path.join(_wts_id, row["relativePath"]) + hashes = row["fileID"] + folder = hashes[:2] + flags = row["flags"] - if flags == 2: # Directory - try: - os.mkdir(destination) - except FileExistsError: - pass - elif flags == 1: # File - shutil.copyfile(os.path.join(self.base_dir, folder, hashes), destination) - metadata = BPListReader(row["metadata"]).parse() - creation = metadata["$objects"][1]["Birth"] - modification = metadata["$objects"][1]["LastModified"] - os.utime(destination, (modification, modification)) - - if row["_index"] % 100 == 0: - logger.info(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})\r") - row = c.fetchone() - logger.info(f"Extracted WhatsApp files...({total_row_number}){CLEAR_LINE}") + if flags == 2: # Directory + try: + os.mkdir(destination) + except FileExistsError: + pass + elif flags == 1: # File + shutil.copyfile(os.path.join(self.base_dir, folder, hashes), destination) + metadata = BPListReader(row["metadata"]).parse() + _creation = metadata["$objects"][1]["Birth"] + modification = metadata["$objects"][1]["LastModified"] + os.utime(destination, (modification, modification)) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + 
logger.info(f"Extracted {total_row_number} WhatsApp files in {convert_time_unit(total_time)}{CLEAR_LINE}") def extract_media(base_dir, identifiers, decrypt_chunk_size): diff --git a/Whatsapp_Chat_Exporter/utility.py b/Whatsapp_Chat_Exporter/utility.py index a147dfb..12c7da0 100644 --- a/Whatsapp_Chat_Exporter/utility.py +++ b/Whatsapp_Chat_Exporter/utility.py @@ -5,13 +5,13 @@ import json import os import unicodedata import re -import string import math import shutil from bleach import clean as sanitize from markupsafe import Markup from datetime import datetime, timedelta from enum import IntEnum +from tqdm import tqdm from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore, Timing from typing import Dict, List, Optional, Tuple, Union try: @@ -248,13 +248,13 @@ def import_from_json(json_file: str, data: ChatCollection): with open(json_file, "r") as f: temp_data = json.loads(f.read()) total_row_number = len(tuple(temp_data.keys())) - logger.info(f"Importing chats from JSON...(0/{total_row_number})\r") - for index, (jid, chat_data) in enumerate(temp_data.items()): - chat = ChatStore.from_json(chat_data) - data.add_chat(jid, chat) - logger.info( - f"Importing chats from JSON...({index + 1}/{total_row_number})\r") - logger.info(f"Imported {total_row_number} chats from JSON{CLEAR_LINE}") + with tqdm(total=total_row_number, desc="Importing chats from JSON", unit="chat", leave=False) as pbar: + for jid, chat_data in temp_data.items(): + chat = ChatStore.from_json(chat_data) + data.add_chat(jid, chat) + pbar.update(1) + total_time = pbar.format_dict['elapsed'] + logger.info(f"Imported {total_row_number} chats from JSON in {convert_time_unit(total_time)}{CLEAR_LINE}") def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_print_json: int, avoid_encoding_json: bool): @@ -439,7 +439,7 @@ CRYPT14_OFFSETS = ( {"iv": 67, "db": 193}, {"iv": 67, "db": 194}, {"iv": 67, "db": 158}, - {"iv": 67, "db": 196} + {"iv": 67, "db": 196}, ) @@ -534,7 +534,7 @@ def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optiona else: msg = "The security code in this chat changed" elif content["action_type"] == 58: - msg = "You blocked this contact" + msg = "You blocked/unblocked this contact" elif content["action_type"] == 67: return # (PM) this contact use secure service from Facebook??? elif content["action_type"] == 69: @@ -639,11 +639,17 @@ def get_from_string(msg: Dict, chat_id: str) -> str: def get_chat_type(chat_id: str) -> str: """Return the chat type based on the whatsapp id""" - if chat_id.endswith("@s.whatsapp.net"): + if chat_id == "000000000000000": + return "calls" + elif chat_id.endswith("@s.whatsapp.net"): return "personal_chat" - if chat_id.endswith("@g.us"): + elif chat_id.endswith("@g.us"): return "private_group" - logger.warning("Unknown chat type for %s, defaulting to private_group", chat_id) + elif chat_id == "status@broadcast": + return "status_broadcast" + elif chat_id.endswith("@broadcast"): + return "broadcast_channel" + logger.warning(f"Unknown chat type for {chat_id}, defaulting to private_group{CLEAR_LINE}") return "private_group" @@ -674,34 +680,35 @@ def telegram_json_format(jik: str, data: Dict, timezone_offset) -> Dict: except ValueError: # not a real chat: e.g. 
@@ -674,34 +680,35 @@ def telegram_json_format(jik: str, data: Dict, timezone_offset) -> Dict:
     except ValueError:  # not a real chat: e.g. status@broadcast
         chat_id = 0
-    obj = {
-        "name": data["name"] if data["name"] else jik,
-        "type": get_chat_type(jik),
-        "id": chat_id,
-        "messages": [ {
-            "id": int(msgId),
-            "type": "message",
-            "date": timing.format_timestamp(msg["timestamp"], "%Y-%m-%dT%H:%M:%S"),
-            "date_unixtime": int(msg["timestamp"]),
-            "from": get_from_string(msg, chat_id),
-            "from_id": get_from_id(msg, chat_id),
-            "reply_to_message_id": get_reply_id(data, msg["reply"]),
-            "text": msg["data"],
-            "text_entities": [
-                {
-                    # TODO this will lose formatting and different types
-                    "type": "plain",
-                    "text": msg["data"],
-                }
-            ],
-        } for msgId, msg in data["messages"].items()]
-    }
+    json_obj = {
+        "name": data["name"] if data["name"] else jik,
+        "type": get_chat_type(jik),
+        "id": chat_id,
+        "messages": [ {
+            "id": int(msgId),
+            "type": "message",
+            "date": timing.format_timestamp(msg["timestamp"], "%Y-%m-%dT%H:%M:%S"),
+            "date_unixtime": int(msg["timestamp"]),
+            "from": get_from_string(msg, chat_id),
+            "from_id": get_from_id(msg, chat_id),
+            "reply_to_message_id": get_reply_id(data, msg["reply"]),
+            "text": msg["data"],
+            "text_entities": [
+                {
+                    # TODO this will lose formatting and different types
+                    "type": "plain",
+                    "text": msg["data"],
+                }
+            ],
+        }
+        for msgId, msg in data["messages"].items()]
+    }
 
     # remove empty messages and replies
-    for msg_id, msg in enumerate(obj["messages"]):
+    for msg_id, msg in enumerate(json_obj["messages"]):
         if not msg["reply_to_message_id"]:
-            del obj["messages"][msg_id]["reply_to_message_id"]
-    obj["messages"] = [m for m in obj["messages"] if m["text"]]
-    return obj
+            del json_obj["messages"][msg_id]["reply_to_message_id"]
+    json_obj["messages"] = [m for m in json_obj["messages"] if m["text"]]
+    return json_obj
 
 
 class WhatsAppIdentifier(StrEnum):
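For orientation, the per-chat structure `telegram_json_format` emits looks roughly like the following. All values are invented for the example, the exact shape of `from_id` depends on `get_from_id`, `reply_to_message_id` is stripped afterwards when empty, and empty-text messages are filtered out:

```python
chat_json = {
    "name": "Alice",
    "type": "personal_chat",
    "id": 85212345678,
    "messages": [
        {
            "id": 42,
            "type": "message",
            "date": "2024-01-31T09:15:00",
            "date_unixtime": 1706692500,
            "from": "Alice",
            "from_id": "user85212345678",  # illustrative; produced by get_from_id()
            "text": "Hello!",
            "text_entities": [{"type": "plain", "text": "Hello!"}],
        },
    ],
}
```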
diff --git a/Whatsapp_Chat_Exporter/whatsapp.html b/Whatsapp_Chat_Exporter/whatsapp.html
index cb56d06..6ef972b 100644
--- a/Whatsapp_Chat_Exporter/whatsapp.html
+++ b/Whatsapp_Chat_Exporter/whatsapp.html
@@ -281,7 +281,9 @@
                 {% filter escape %}{{ msg.data }}{% endfilter %}
             {% endif %}
             {% if msg.caption is not none %}
-                {{ msg.caption | urlize(none, true, '_blank') }}
+
+                {{ msg.caption | urlize(none, true, '_blank') }}
+
             {% endif %}
         {% endif %}
     {% endif %}
@@ -351,7 +353,9 @@
                 {% filter escape %}{{ msg.data }}{% endfilter %}
             {% endif %}
             {% if msg.caption is not none %}
-                {{ msg.caption | urlize(none, true, '_blank') }}
+
+                {{ msg.caption | urlize(none, true, '_blank') }}
+
             {% endif %}
         {% endif %}
     {% endif %}
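A side note on the template hunks: with the positional arguments used here, Jinja's `urlize` filter applies no URL trimming (`none`), adds `rel="nofollow"` (`true`), and opens links in a new tab (`target='_blank'`). A quick way to see the effect outside the exporter (a standalone sketch, not project code):

```python
from jinja2 import Environment

env = Environment(autoescape=True)
tmpl = env.from_string("{{ caption | urlize(none, true, '_blank') }}")
print(tmpl.render(caption="See https://example.com for details"))
# Emits an <a href="https://example.com" ...> anchor carrying the nofollow/target
# attributes; the exact rel value varies slightly across Jinja versions.
```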
diff --git a/pyproject.toml b/pyproject.toml
index c92baeb..f467f1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,17 +36,19 @@ classifiers = [
 requires-python = ">=3.10"
 dependencies = [
     "jinja2",
-    "bleach"
+    "bleach",
+    "tqdm"
 ]
 
 [project.optional-dependencies]
 android_backup = ["pycryptodome", "javaobj-py3"]
+ios_backup = ["iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
 crypt12 = ["pycryptodome"]
 crypt14 = ["pycryptodome"]
 crypt15 = ["pycryptodome", "javaobj-py3"]
-all = ["pycryptodome", "javaobj-py3"]
-everything = ["pycryptodome", "javaobj-py3"]
-backup = ["pycryptodome", "javaobj-py3"]
+all = ["pycryptodome", "javaobj-py3", "iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
+everything = ["pycryptodome", "javaobj-py3", "iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
+backup = ["pycryptodome", "javaobj-py3", "iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
 
 [project.scripts]
 wtsexporter = "Whatsapp_Chat_Exporter.__main__:main"
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..50f0866
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,27 @@
+import pytest
+import os
+
+def pytest_collection_modifyitems(config, items):
+    """
+    Moves test_nuitka_binary.py to the end and fails if the file is missing.
+    """
+    target_file = "test_nuitka_binary.py"
+
+    # Sanity Check: Ensure the file actually exists in the tests directory
+    test_dir = os.path.join(config.rootdir, "tests")
+    file_path = os.path.join(test_dir, target_file)
+
+    if not os.path.exists(file_path):
+        pytest.exit(f"\n[FATAL] Required test file '{target_file}' not found in {test_dir}. "
+                    f"Order enforcement failed!", returncode=1)
+
+    nuitka_tests = []
+    remaining_tests = []
+
+    for item in items:
+        if target_file in item.nodeid:
+            nuitka_tests.append(item)
+        else:
+            remaining_tests.append(item)
+
+    items[:] = remaining_tests + nuitka_tests
\ No newline at end of file
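One detail in the new `tests/conftest.py` worth calling out: pytest hands the hook its live `items` list, so the list must be mutated in place (`items[:] = ...`); rebinding the name would be a silent no-op. Since `list.sort` is stable, an equivalent and shorter sketch of the same reordering would be:

```python
def pytest_collection_modifyitems(config, items):
    # False sorts before True, so everything from test_nuitka_binary.py
    # moves to the end while relative order inside each group is kept.
    items.sort(key=lambda item: "test_nuitka_binary.py" in item.nodeid)
```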