Merge branch 'dev' into feature/export-reactions

This commit is contained in:
Knugi
2026-01-19 20:38:20 +08:00
committed by GitHub
12 changed files with 541 additions and 422 deletions

View File

@@ -115,7 +115,7 @@ Do an iPhone/iPad Backup with iTunes/Finder first.
If you want to work on an encrypted iOS/iPadOS Backup, you should install iphone_backup_decrypt from [KnugiHK/iphone_backup_decrypt](https://github.com/KnugiHK/iphone_backup_decrypt) before you run the extract_iphone_media.py.
```sh
pip install git+https://github.com/KnugiHK/iphone_backup_decrypt
pip install whatsapp-chat-exporter["ios_backup"]
```
> [!NOTE]
> You will need to disable the built-in end-to-end encryption for WhatsApp backups. See [WhatsApp's FAQ](https://faq.whatsapp.com/490592613091019#turn-off-end-to-end-encrypted-backup) for how to do it.

View File

@@ -11,14 +11,15 @@ import logging
import importlib.metadata
from Whatsapp_Chat_Exporter import android_crypt, exported_handler, android_handler
from Whatsapp_Chat_Exporter import ios_handler, ios_media_handler
from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore
from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, Crypt, check_update
from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore, Timing
from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, Crypt
from Whatsapp_Chat_Exporter.utility import readable_to_bytes, safe_name, bytes_to_readable
from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, DbType
from Whatsapp_Chat_Exporter.utility import telegram_json_format
from Whatsapp_Chat_Exporter.utility import import_from_json, incremental_merge, check_update
from Whatsapp_Chat_Exporter.utility import telegram_json_format, convert_time_unit, DbType
from argparse import ArgumentParser, SUPPRESS
from datetime import datetime
from getpass import getpass
from tqdm import tqdm
from sys import exit
from typing import Optional, List, Dict
from Whatsapp_Chat_Exporter.vcards_contacts import ContactsFromVCards
@@ -286,13 +287,17 @@ def setup_argument_parser() -> ArgumentParser:
help="Specify the chunk size for decrypting iOS backup, which may affect the decryption speed."
)
misc_group.add_argument(
"--max-bruteforce-worker", dest="max_bruteforce_worker", default=10, type=int,
"--max-bruteforce-worker", dest="max_bruteforce_worker", default=4, type=int,
help="Specify the maximum number of worker for bruteforce decryption."
)
misc_group.add_argument(
"--no-banner", dest="no_banner", default=False, action='store_true',
help="Do not show the banner"
)
misc_group.add_argument(
"--fix-dot-files", dest="fix_dot_files", default=False, action='store_true',
help="Fix files with a dot at the end of their name (allowing the outputs be stored in FAT filesystems)"
)
return parser
@@ -537,6 +542,7 @@ def process_messages(args, data: ChatCollection) -> None:
exit(6)
filter_chat = (args.filter_chat_include, args.filter_chat_exclude)
timing = Timing(args.timezone_offset if args.timezone_offset else CURRENT_TZ_OFFSET)
with sqlite3.connect(msg_db) as db:
db.row_factory = sqlite3.Row
@@ -548,14 +554,14 @@ def process_messages(args, data: ChatCollection) -> None:
message_handler = ios_handler
message_handler.messages(
db, data, args.media, args.timezone_offset, args.filter_date,
db, data, args.media, timing, args.filter_date,
filter_chat, args.filter_empty, args.no_reply_ios
)
# Process media
message_handler.media(
db, data, args.media, args.filter_date,
filter_chat, args.filter_empty, args.separate_media
filter_chat, args.filter_empty, args.separate_media, args.fix_dot_files
)
# Process vcards
@@ -565,17 +571,17 @@ def process_messages(args, data: ChatCollection) -> None:
)
# Process calls
process_calls(args, db, data, filter_chat)
process_calls(args, db, data, filter_chat, timing)
def process_calls(args, db, data: ChatCollection, filter_chat) -> None:
def process_calls(args, db, data: ChatCollection, filter_chat, timing) -> None:
"""Process call history if available."""
if args.android:
android_handler.calls(db, data, args.timezone_offset, filter_chat)
android_handler.calls(db, data, timing, filter_chat)
elif args.ios and args.call_db_ios is not None:
with sqlite3.connect(args.call_db_ios) as cdb:
cdb.row_factory = sqlite3.Row
ios_handler.calls(cdb, data, args.timezone_offset, filter_chat)
ios_handler.calls(cdb, data, timing, filter_chat)
def handle_media_directory(args) -> None:
@@ -665,24 +671,27 @@ def export_multiple_json(args, data: Dict) -> None:
# Export each chat
total = len(data.keys())
for index, jik in enumerate(data.keys()):
if data[jik]["name"] is not None:
contact = data[jik]["name"].replace('/', '')
else:
contact = jik.replace('+', '')
with tqdm(total=total, desc="Generating JSON files", unit="file", leave=False) as pbar:
for jik in data.keys():
if data[jik]["name"] is not None:
contact = data[jik]["name"].replace('/', '')
else:
contact = jik.replace('+', '')
if args.telegram:
messages = telegram_json_format(jik, data[jik], args.timezone_offset)
else:
messages = {jik: data[jik]}
with open(f"{json_path}/{safe_name(contact)}.json", "w") as f:
file_content = json.dumps(
messages,
ensure_ascii=not args.avoid_encoding_json,
indent=args.pretty_print_json
)
f.write(file_content)
logger.info(f"Writing JSON file...({index + 1}/{total})\r")
if args.telegram:
messages = telegram_json_format(jik, data[jik], args.timezone_offset)
else:
messages = {jik: data[jik]}
with open(f"{json_path}/{safe_name(contact)}.json", "w") as f:
file_content = json.dumps(
messages,
ensure_ascii=not args.avoid_encoding_json,
indent=args.pretty_print_json
)
f.write(file_content)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Generated {total} JSON files in {convert_time_unit(total_time)}{CLEAR_LINE}")
def process_exported_chat(args, data: ChatCollection) -> None:

View File

@@ -1,13 +1,12 @@
import time
import hmac
import io
import logging
import threading
import zlib
import concurrent.futures
from tqdm import tqdm
from typing import Tuple, Union
from hashlib import sha256
from sys import exit
from functools import partial
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CRYPT14_OFFSETS, Crypt, DbType
try:
@@ -112,13 +111,36 @@ def _decrypt_database(db_ciphertext: bytes, main_key: bytes, iv: bytes) -> bytes
zlib.error: If decompression fails.
ValueError: if the plaintext is not a SQLite database.
"""
FOOTER_SIZE = 32
if len(db_ciphertext) <= FOOTER_SIZE:
raise ValueError("Input data too short to contain a valid GCM tag.")
actual_ciphertext = db_ciphertext[:-FOOTER_SIZE]
tag = db_ciphertext[-FOOTER_SIZE: -FOOTER_SIZE + 16]
cipher = AES.new(main_key, AES.MODE_GCM, iv)
db_compressed = cipher.decrypt(db_ciphertext)
db = zlib.decompress(db_compressed)
if db[0:6].upper() != b"SQLITE":
try:
db_compressed = cipher.decrypt_and_verify(actual_ciphertext, tag)
except ValueError:
# This could be key, IV, or tag is wrong, but likely the key is wrong.
raise ValueError("Decryption/Authentication failed. Ensure you are using the correct key.")
if len(db_compressed) < 2 or db_compressed[0] != 0x78:
logger.debug(f"Data passes GCM but is not Zlib. Header: {db_compressed[:2].hex()}")
raise ValueError(
"The plaintext is not a SQLite database. Ensure you are using the correct key."
"Key is correct, but decrypted data is not a valid compressed stream. "
"Is this even a valid WhatsApp database backup?"
)
try:
db = zlib.decompress(db_compressed)
except zlib.error as e:
raise zlib.error(f"Decompression failed (The backup file likely corrupted at source): {e}")
if not db.startswith(b"SQLite"):
raise ValueError(
"Data is valid and decompressed, but it is not a SQLite database. "
"Is this even a valid WhatsApp database backup?")
return db
@@ -142,82 +164,69 @@ def _decrypt_crypt14(database: bytes, main_key: bytes, max_worker: int = 10) ->
# Attempt known offsets first
for offsets in CRYPT14_OFFSETS:
iv = database[offsets["iv"]:offsets["iv"] + 16]
db_ciphertext = database[offsets["db"]:]
iv = offsets["iv"]
db = offsets["db"]
try:
decrypted_db = _decrypt_database(db_ciphertext, main_key, iv)
decrypted_db = _attempt_decrypt_task((iv, iv + 16, db), database, main_key)
except (zlib.error, ValueError):
pass # Try next offset
continue
else:
logger.debug(
f"Decryption successful with known offsets: IV {offsets['iv']}, DB {offsets['db']}{CLEAR_LINE}"
f"Decryption successful with known offsets: IV {iv}, DB {db}{CLEAR_LINE}"
)
return decrypted_db # Successful decryption
def animate_message(stop_event):
base_msg = "Common offsets failed. Initiating brute-force with multithreading"
dots = ["", ".", "..", "..."]
i = 0
while not stop_event.is_set():
logger.info(f"{base_msg}{dots[i % len(dots)]}\x1b[K\r")
time.sleep(0.3)
i += 1
logger.info(f"Common offsets failed but brute-forcing the offset works!{CLEAR_LINE}")
stop_event = threading.Event()
anim_thread = threading.Thread(target=animate_message, args=(stop_event,))
anim_thread.start()
# Convert brute force generator into a list for parallel processing
offset_combinations = list(brute_force_offset())
def attempt_decrypt(offset_tuple):
"""Attempt decryption with the given offsets."""
start_iv, end_iv, start_db = offset_tuple
iv = database[start_iv:end_iv]
db_ciphertext = database[start_db:]
logger.debug(""f"Trying offsets: IV {start_iv}-{end_iv}, DB {start_db}{CLEAR_LINE}")
try:
db = _decrypt_database(db_ciphertext, main_key, iv)
except (zlib.error, ValueError):
return None # Decryption failed, move to next
else:
stop_event.set()
anim_thread.join()
logger.info(
f"The offsets of your IV and database are {start_iv} and "
f"{start_db}, respectively. To include your offsets in the "
"program, please report it by creating an issue on GitHub: "
"https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47"
f"\nShutting down other threads...{CLEAR_LINE}"
)
return db
with concurrent.futures.ThreadPoolExecutor(max_worker) as executor:
future_to_offset = {executor.submit(attempt_decrypt, offset)
: offset for offset in offset_combinations}
try:
for future in concurrent.futures.as_completed(future_to_offset):
result = future.result()
if result is not None:
# Shutdown remaining threads
logger.info(f"Common offsets failed. Will attempt to brute-force{CLEAR_LINE}")
offset_max = 200
workers = max_worker
check_offset = partial(_attempt_decrypt_task, database=database, main_key=main_key)
all_offsets = list(brute_force_offset(offset_max, offset_max))
executor = concurrent.futures.ProcessPoolExecutor(max_workers=workers)
try:
with tqdm(total=len(all_offsets), desc="Brute-forcing offsets", unit="trial", leave=False) as pbar:
results = executor.map(check_offset, all_offsets, chunksize=8)
found = False
for offset_info, result in zip(all_offsets, results):
pbar.update(1)
if result:
start_iv, _, start_db = offset_info
# Clean shutdown on success
executor.shutdown(wait=False, cancel_futures=True)
return result
found = True
break
if found:
logger.info(
f"The offsets of your IV and database are {start_iv} and {start_db}, respectively.{CLEAR_LINE}"
)
logger.info(
f"To include your offsets in the expoter, please report it in the discussion thread on GitHub:{CLEAR_LINE}"
)
logger.info(f"https://github.com/KnugiHK/Whatsapp-Chat-Exporter/discussions/47{CLEAR_LINE}")
return result
except KeyboardInterrupt:
stop_event.set()
anim_thread.join()
logger.info(f"Brute force interrupted by user (Ctrl+C). Shutting down gracefully...{CLEAR_LINE}")
executor.shutdown(wait=False, cancel_futures=True)
exit(1)
finally:
stop_event.set()
anim_thread.join()
except KeyboardInterrupt:
executor.shutdown(wait=False, cancel_futures=True)
print("\n")
raise KeyboardInterrupt(
f"Brute force interrupted by user (Ctrl+C). Shutting down gracefully...{CLEAR_LINE}"
)
finally:
executor.shutdown(wait=False)
raise OffsetNotFoundError("Could not find the correct offsets for decryption.")
def _attempt_decrypt_task(offset_tuple, database, main_key):
    """Try to decrypt ``database`` using one candidate set of offsets.

    Args:
        offset_tuple: A ``(start_iv, end_iv, start_db)`` triple. The IV is
            taken from ``database[start_iv:end_iv]`` and the ciphertext is
            everything from ``start_db`` to the end of ``database``.
        database: The raw encrypted backup contents.
        main_key: The key handed to ``_decrypt_database`` unchanged.

    Returns:
        Whatever ``_decrypt_database`` returns on success, or ``None`` when
        it raises ``zlib.error`` or ``ValueError``.

    Note:
        ``zlib.error`` and ``ValueError`` never propagate out of this
        function; a failed attempt is reported only through the ``None``
        return value, so callers have to test the result rather than rely
        on catching those exceptions.
    """
    iv_begin, iv_end, payload_begin = offset_tuple
    candidate_iv = database[iv_begin:iv_end]
    candidate_ciphertext = database[payload_begin:]
    try:
        plaintext = _decrypt_database(candidate_ciphertext, main_key, candidate_iv)
    except (zlib.error, ValueError):
        # A wrong offset guess is expected; report it as "no result".
        return None
    return plaintext
def _decrypt_crypt12(database: bytes, main_key: bytes) -> bytes:
"""Decrypt a crypt12 database.

View File

@@ -4,13 +4,14 @@ import logging
import sqlite3
import os
import shutil
from tqdm import tqdm
from pathlib import Path
from mimetypes import MimeTypes
from markupsafe import escape as htmle
from base64 import b64decode, b64encode
from datetime import datetime
from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device
from Whatsapp_Chat_Exporter.data_model import ChatStore, Message, Timing
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, MAX_SIZE, ROW_SIZE, JidType, Device
from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty
from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata
from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable
@@ -47,12 +48,15 @@ def contacts(db, data, enrich_from_vcards):
logger.info(f"Processed {total_row_number} contacts\n")
c.execute("SELECT jid, COALESCE(display_name, wa_name) as display_name, status FROM wa_contacts;")
row = c.fetchone()
while row is not None:
current_chat = data.add_chat(row["jid"], ChatStore(Device.ANDROID, row["display_name"]))
if row["status"] is not None:
current_chat.status = row["status"]
row = c.fetchone()
with tqdm(total=total_row_number, desc="Processing contacts", unit="contact", leave=False) as pbar:
while (row := _fetch_row_safely(c)) is not None:
current_chat = data.add_chat(row["jid"], ChatStore(Device.ANDROID, row["display_name"]))
if row["status"] is not None:
current_chat.status = row["status"]
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} contacts in {convert_time_unit(total_time)}{CLEAR_LINE}")
return True
@@ -72,7 +76,6 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat,
"""
c = db.cursor()
total_row_number = _get_message_count(c, filter_empty, filter_date, filter_chat)
logger.info(f"Processing messages...(0/{total_row_number})\r")
try:
content_cursor = _get_messages_cursor_legacy(c, filter_empty, filter_date, filter_chat)
@@ -84,23 +87,12 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat,
except Exception as e:
raise e
i = 0
# Fetch the first row safely
content = _fetch_row_safely(content_cursor)
while content is not None:
_process_single_message(data, content, table_message, timezone_offset)
i += 1
if i % 1000 == 0:
logger.info(f"Processing messages...({i}/{total_row_number})\r")
# Fetch the next row safely
content = _fetch_row_safely(content_cursor)
_get_reactions(db, data)
logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}")
with tqdm(total=total_row_number, desc="Processing messages", unit="msg", leave=False) as pbar:
while (content := _fetch_row_safely(content_cursor)) is not None:
_process_single_message(data, content, table_message, timezone_offset)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} messages in {convert_time_unit(total_time)}{CLEAR_LINE}")
# Helper functions for message processing
@@ -126,14 +118,16 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat):
{include_filter}
{exclude_filter}""")
except sqlite3.OperationalError:
empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast")
empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")
date_filter = f'AND timestamp {filter_date}' if filter_date is not None else ''
include_filter = get_chat_condition(
filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")
filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android")
exclude_filter = get_chat_condition(
filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")
filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android")
cursor.execute(f"""SELECT count()
cursor.execute(f"""SELECT count(),
COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid,
COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid
FROM message
LEFT JOIN chat
ON chat._id = message.chat_row_id
@@ -141,6 +135,14 @@ def _get_message_count(cursor, filter_empty, filter_date, filter_chat):
ON jid._id = chat.jid_row_id
LEFT JOIN jid jid_group
ON jid_group._id = message.sender_jid_row_id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
LEFT JOIN jid_map as jid_map_group
ON message.sender_jid_row_id = jid_map_group.lid_row_id
LEFT JOIN jid lid_group
ON jid_map_group.jid_row_id = lid_group._id
WHERE 1=1
{empty_filter}
{date_filter}
@@ -219,11 +221,11 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat):
empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")
date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else ''
include_filter = get_chat_condition(
filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android")
filter_chat[0], True, ["key_remote_jid", "lid_group.raw_string"], "jid_global", "android")
exclude_filter = get_chat_condition(
filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid_global", "android")
filter_chat[1], False, ["key_remote_jid", "lid_group.raw_string"], "jid_global", "android")
cursor.execute(f"""SELECT jid_global.raw_string as key_remote_jid,
cursor.execute(f"""SELECT COALESCE(lid_global.raw_string, jid_global.raw_string) as key_remote_jid,
message._id,
message.from_me as key_from_me,
message.timestamp,
@@ -238,7 +240,7 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat):
message.key_id,
message_quoted.text_data as quoted_data,
message.message_type as media_wa_type,
jid_group.raw_string as group_sender_jid,
COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid,
chat.subject as chat_subject,
missed_call_logs.video_call,
message.sender_jid_row_id,
@@ -248,7 +250,8 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat):
jid_new.raw_string as new_jid,
jid_global.type as jid_type,
COALESCE(receipt_user.receipt_timestamp, message.received_timestamp) as received_timestamp,
COALESCE(receipt_user.read_timestamp, receipt_user.played_timestamp) as read_timestamp
COALESCE(receipt_user.read_timestamp, receipt_user.played_timestamp) as read_timestamp,
message_media.raw_transcription_text as transcription_text
FROM message
LEFT JOIN message_quoted
ON message_quoted.message_row_id = message._id
@@ -280,6 +283,14 @@ def _get_messages_cursor_new(cursor, filter_empty, filter_date, filter_chat):
ON jid_new._id = message_system_number_change.new_jid_row_id
LEFT JOIN receipt_user
ON receipt_user.message_row_id = message._id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
LEFT JOIN jid_map as jid_map_group
ON message.sender_jid_row_id = jid_map_group.lid_row_id
LEFT JOIN jid lid_group
ON jid_map_group.jid_row_id = lid_group._id
WHERE key_remote_jid <> '-1'
{empty_filter}
{date_filter}
@@ -321,7 +332,7 @@ def _process_single_message(data, content, table_message, timezone_offset):
timestamp=content["timestamp"],
time=content["timestamp"],
key_id=content["key_id"],
timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET,
timezone_offset=timezone_offset,
message_type=content["media_wa_type"],
received_timestamp=content["received_timestamp"],
read_timestamp=content["read_timestamp"]
@@ -353,9 +364,12 @@ def _process_single_message(data, content, table_message, timezone_offset):
if not table_message and content["media_caption"] is not None:
# Old schema
message.caption = content["media_caption"]
elif table_message and content["media_wa_type"] == 1 and content["data"] is not None:
elif table_message:
# New schema
message.caption = content["data"]
if content["media_wa_type"] == 1 and content["data"] is not None:
message.caption = content["data"]
elif content["media_wa_type"] == 2 and content["transcription_text"] is not None:
message.caption = f'"{content["transcription_text"]}"'
else:
message.caption = None
@@ -547,7 +561,7 @@ def _get_reactions(db, data):
logger.info(f"Processed reactions{CLEAR_LINE}")
def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True):
def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=True, fix_dot_files=False):
"""
Process WhatsApp media files from the database.
@@ -562,8 +576,6 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa
"""
c = db.cursor()
total_row_number = _get_media_count(c, filter_empty, filter_date, filter_chat)
logger.info(f"Processing media...(0/{total_row_number})\r")
try:
content_cursor = _get_media_cursor_legacy(c, filter_empty, filter_date, filter_chat)
except sqlite3.OperationalError:
@@ -575,18 +587,12 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa
# Ensure thumbnails directory exists
Path(f"{media_folder}/thumbnails").mkdir(parents=True, exist_ok=True)
i = 0
while content is not None:
_process_single_media(data, content, media_folder, mime, separate_media)
i += 1
if i % 100 == 0:
logger.info(f"Processing media...({i}/{total_row_number})\r")
content = content_cursor.fetchone()
logger.info(f"Processed {total_row_number} media{CLEAR_LINE}")
with tqdm(total=total_row_number, desc="Processing media", unit="media", leave=False) as pbar:
while (content := _fetch_row_safely(content_cursor)) is not None:
_process_single_media(data, content, media_folder, mime, separate_media, fix_dot_files)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} media in {convert_time_unit(total_time)}{CLEAR_LINE}")
# Helper functions for media processing
@@ -617,11 +623,13 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat):
empty_filter = get_cond_for_empty(filter_empty, "jid.raw_string", "broadcast")
date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else ''
include_filter = get_chat_condition(
filter_chat[0], True, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")
filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android")
exclude_filter = get_chat_condition(
filter_chat[1], False, ["jid.raw_string", "jid_group.raw_string"], "jid", "android")
filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android")
cursor.execute(f"""SELECT count()
cursor.execute(f"""SELECT count(),
COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid,
COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid
FROM message_media
INNER JOIN message
ON message_media.message_row_id = message._id
@@ -631,6 +639,14 @@ def _get_media_count(cursor, filter_empty, filter_date, filter_chat):
ON jid._id = chat.jid_row_id
LEFT JOIN jid jid_group
ON jid_group._id = message.sender_jid_row_id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
LEFT JOIN jid_map as jid_map_group
ON message.sender_jid_row_id = jid_map_group.lid_row_id
LEFT JOIN jid lid_group
ON jid_map_group.jid_row_id = lid_group._id
WHERE 1=1
{empty_filter}
{date_filter}
@@ -679,18 +695,19 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat):
empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "broadcast")
date_filter = f'AND message.timestamp {filter_date}' if filter_date is not None else ''
include_filter = get_chat_condition(
filter_chat[0], True, ["key_remote_jid", "jid_group.raw_string"], "jid", "android")
filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android")
exclude_filter = get_chat_condition(
filter_chat[1], False, ["key_remote_jid", "jid_group.raw_string"], "jid", "android")
filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android")
cursor.execute(f"""SELECT jid.raw_string as key_remote_jid,
cursor.execute(f"""SELECT COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid,
message_row_id,
file_path,
message_url,
mime_type,
media_key,
file_hash,
thumbnail
thumbnail,
COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid
FROM message_media
INNER JOIN message
ON message_media.message_row_id = message._id
@@ -702,6 +719,14 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat):
ON message_media.file_hash = media_hash_thumbnail.media_hash
LEFT JOIN jid jid_group
ON jid_group._id = message.sender_jid_row_id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
LEFT JOIN jid_map as jid_map_group
ON message.sender_jid_row_id = jid_map_group.lid_row_id
LEFT JOIN jid lid_group
ON jid_map_group.jid_row_id = lid_group._id
WHERE jid.type <> 7
{empty_filter}
{date_filter}
@@ -711,7 +736,7 @@ def _get_media_cursor_new(cursor, filter_empty, filter_date, filter_chat):
return cursor
def _process_single_media(data, content, media_folder, mime, separate_media):
def _process_single_media(data, content, media_folder, mime, separate_media, fix_dot_files=False):
"""Process a single media file."""
file_path = f"{media_folder}/{content['file_path']}"
current_chat = data.get_chat(content["key_remote_jid"])
@@ -719,8 +744,6 @@ def _process_single_media(data, content, media_folder, mime, separate_media):
message.media = True
if os.path.isfile(file_path):
message.data = file_path
# Set mime type
if content["mime_type"] is None:
guess = mime.guess_type(file_path)[0]
@@ -730,6 +753,16 @@ def _process_single_media(data, content, media_folder, mime, separate_media):
message.mime = "application/octet-stream"
else:
message.mime = content["mime_type"]
if fix_dot_files and file_path.endswith("."):
extension = mime.guess_extension(message.mime)
if message.mime == "application/octet-stream" or not extension:
new_file_path = file_path[:-1]
else:
extension = mime.guess_extension(message.mime)
new_file_path = file_path[:-1] + extension
os.rename(file_path, new_file_path)
file_path = new_file_path
# Copy media to separate folder if needed
if separate_media:
@@ -741,6 +774,8 @@ def _process_single_media(data, content, media_folder, mime, separate_media):
new_path = os.path.join(new_folder, current_filename)
shutil.copy2(file_path, new_path)
message.data = new_path
else:
message.data = file_path
else:
message.data = "The media is missing"
message.mime = "media"
@@ -764,45 +799,56 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty):
rows = _execute_vcard_query_legacy(c, filter_date, filter_chat, filter_empty)
total_row_number = len(rows)
logger.info(f"Processing vCards...(0/{total_row_number})\r")
# Create vCards directory if it doesn't exist
path = os.path.join(media_folder, "vCards")
Path(path).mkdir(parents=True, exist_ok=True)
for index, row in enumerate(rows):
_process_vcard_row(row, path, data)
logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r")
logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}")
with tqdm(total=total_row_number, desc="Processing vCards", unit="vcard", leave=False) as pbar:
for row in rows:
_process_vcard_row(row, path, data)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} vCards in {convert_time_unit(total_time)}{CLEAR_LINE}")
def _execute_vcard_query_modern(c, filter_date, filter_chat, filter_empty):
"""Execute vCard query for modern WhatsApp database schema."""
# Build the filter conditions
chat_filter_include = get_chat_condition(
filter_chat[0], True, ["messages.key_remote_jid", "remote_resource"], "jid", "android")
chat_filter_exclude = get_chat_condition(
filter_chat[1], False, ["messages.key_remote_jid", "remote_resource"], "jid", "android")
date_filter = f'AND messages.timestamp {filter_date}' if filter_date is not None else ''
empty_filter = get_cond_for_empty(filter_empty, "key_remote_jid", "messages.needs_push")
include_filter = get_chat_condition(
filter_chat[0], True, ["key_remote_jid", "group_sender_jid"], "jid", "android")
exclude_filter = get_chat_condition(
filter_chat[1], False, ["key_remote_jid", "group_sender_jid"], "jid", "android")
query = f"""SELECT message_row_id,
messages.key_remote_jid,
vcard,
messages.media_name
FROM messages_vcards
INNER JOIN messages
ON messages_vcards.message_row_id = messages._id
INNER JOIN jid
ON messages.key_remote_jid = jid.raw_string
LEFT JOIN chat
ON chat.jid_row_id = jid._id
COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid,
vcard,
messages.media_name,
COALESCE(lid_group.raw_string, jid_group.raw_string) as group_sender_jid
FROM messages_vcards
INNER JOIN messages
ON messages_vcards.message_row_id = messages._id
INNER JOIN jid
ON messages.key_remote_jid = jid.raw_string
LEFT JOIN chat
ON chat.jid_row_id = jid._id
LEFT JOIN jid jid_group
ON jid_group._id = message.sender_jid_row_id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
LEFT JOIN jid_map as jid_map_group
ON message.sender_jid_row_id = jid_map_group.lid_row_id
LEFT JOIN jid lid_group
ON jid_map_group.jid_row_id = lid_group._id
WHERE 1=1
{empty_filter}
{date_filter}
{chat_filter_include}
{chat_filter_exclude}
{include_filter}
{exclude_filter}
ORDER BY messages.key_remote_jid ASC;"""
c.execute(query)
return c.fetchall()
@@ -879,32 +925,37 @@ def calls(db, data, timezone_offset, filter_chat):
chat = ChatStore(Device.ANDROID, "WhatsApp Calls")
# Process each call
content = calls_data.fetchone()
while content is not None:
_process_call_record(content, chat, data, timezone_offset)
content = calls_data.fetchone()
with tqdm(total=total_row_number, desc="Processing calls", unit="call", leave=False) as pbar:
while (content := _fetch_row_safely(calls_data)) is not None:
_process_call_record(content, chat, data, timezone_offset)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
# Add the calls chat to the data
data.add_chat("000000000000000", chat)
logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}")
logger.info(f"Processed {total_row_number} calls in {convert_time_unit(total_time)}{CLEAR_LINE}")
def _get_calls_count(c, filter_chat):
"""Get the count of call records that match the filter."""
# Build the filter conditions
chat_filter_include = get_chat_condition(filter_chat[0], True, ["jid.raw_string"])
chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["jid.raw_string"])
include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid"])
exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid"])
query = f"""SELECT count()
query = f"""SELECT count(),
COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid
FROM call_log
INNER JOIN jid
ON call_log.jid_row_id = jid._id
LEFT JOIN chat
ON call_log.jid_row_id = chat.jid_row_id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
WHERE 1=1
{chat_filter_include}
{chat_filter_exclude}"""
{include_filter}
{exclude_filter}"""
c.execute(query)
return c.fetchone()[0]
@@ -913,11 +964,11 @@ def _fetch_calls_data(c, filter_chat):
"""Fetch call data from the database."""
# Build the filter conditions
chat_filter_include = get_chat_condition(filter_chat[0], True, ["jid.raw_string"])
chat_filter_exclude = get_chat_condition(filter_chat[1], False, ["jid.raw_string"])
include_filter = get_chat_condition(filter_chat[0], True, ["key_remote_jid"])
exclude_filter = get_chat_condition(filter_chat[1], False, ["key_remote_jid"])
query = f"""SELECT call_log._id,
jid.raw_string,
COALESCE(lid_global.raw_string, jid.raw_string) as key_remote_jid,
from_me,
call_id,
timestamp,
@@ -931,9 +982,13 @@ def _fetch_calls_data(c, filter_chat):
ON call_log.jid_row_id = jid._id
LEFT JOIN chat
ON call_log.jid_row_id = chat.jid_row_id
LEFT JOIN jid_map as jid_map_global
ON chat.jid_row_id = jid_map_global.lid_row_id
LEFT JOIN jid lid_global
ON jid_map_global.jid_row_id = lid_global._id
WHERE 1=1
{chat_filter_include}
{chat_filter_exclude}"""
{include_filter}
{exclude_filter}"""
c.execute(query)
return c
@@ -945,13 +1000,13 @@ def _process_call_record(content, chat, data, timezone_offset):
timestamp=content["timestamp"],
time=content["timestamp"],
key_id=content["call_id"],
timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET,
timezone_offset=timezone_offset,
received_timestamp=None, # TODO: Add timestamp
read_timestamp=None # TODO: Add timestamp
)
# Get caller/callee name
_jid = content["raw_string"]
_jid = content["key_remote_jid"]
name = data.get_chat(_jid).name if _jid in data else content["chat_subject"] or None
if _jid is not None and "@" in _jid:
fallback = _jid.split('@')[0]
@@ -996,6 +1051,7 @@ def _construct_call_description(content, call):
return description
# TODO: Marked for enhancement on multi-threaded processing
def create_html(
data,
output_folder,
@@ -1011,7 +1067,6 @@ def create_html(
template = setup_template(template, no_avatar, experimental)
total_row_number = len(data)
logger.info(f"Generating chats...(0/{total_row_number})\r")
# Create output directory if it doesn't exist
if not os.path.isdir(output_folder):
@@ -1019,43 +1074,42 @@ def create_html(
w3css = get_status_location(output_folder, offline_static)
for current, contact in enumerate(data):
current_chat = data.get_chat(contact)
if len(current_chat) == 0:
# Skip empty chats
continue
with tqdm(total=total_row_number, desc="Generating HTML", unit="file", leave=False) as pbar:
for contact in data:
current_chat = data.get_chat(contact)
if len(current_chat) == 0:
# Skip empty chats
continue
safe_file_name, name = get_file_name(contact, current_chat)
safe_file_name, name = get_file_name(contact, current_chat)
if maximum_size is not None:
_generate_paginated_chat(
current_chat,
safe_file_name,
name,
contact,
output_folder,
template,
w3css,
maximum_size,
headline
)
else:
_generate_single_chat(
current_chat,
safe_file_name,
name,
contact,
output_folder,
template,
w3css,
headline
)
if current % 10 == 0:
logger.info(f"Generating chats...({current}/{total_row_number})\r")
logger.info(f"Generated {total_row_number} chats{CLEAR_LINE}")
if maximum_size is not None:
_generate_paginated_chat(
current_chat,
safe_file_name,
name,
contact,
output_folder,
template,
w3css,
maximum_size,
headline
)
else:
_generate_single_chat(
current_chat,
safe_file_name,
name,
contact,
output_folder,
template,
w3css,
headline
)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Generated {total_row_number} chats in {convert_time_unit(total_time)}{CLEAR_LINE}")
def _generate_single_chat(current_chat, safe_file_name, name, contact, output_folder, template, w3css, headline):
"""Generate a single HTML file for a chat."""

View File

@@ -279,7 +279,7 @@ class Message:
key_id: Union[int, str],
received_timestamp: int = None,
read_timestamp: int = None,
timezone_offset: int = 0,
timezone_offset: Optional[Timing] = Timing(0),
message_type: Optional[int] = None
) -> None:
"""
@@ -300,10 +300,9 @@ class Message:
"""
self.from_me = bool(from_me)
self.timestamp = timestamp / 1000 if timestamp > 9999999999 else timestamp
timing = Timing(timezone_offset)
if isinstance(time, (int, float)):
self.time = timing.format_timestamp(self.timestamp, "%H:%M")
self.time = timezone_offset.format_timestamp(self.timestamp, "%H:%M")
elif isinstance(time, str):
self.time = time
else:
@@ -318,14 +317,14 @@ class Message:
self.mime = None
self.message_type = message_type
if isinstance(received_timestamp, (int, float)):
self.received_timestamp = timing.format_timestamp(
self.received_timestamp = timezone_offset.format_timestamp(
received_timestamp, "%Y/%m/%d %H:%M")
elif isinstance(received_timestamp, str):
self.received_timestamp = received_timestamp
else:
self.received_timestamp = None
if isinstance(read_timestamp, (int, float)):
self.read_timestamp = timing.format_timestamp(
self.read_timestamp = timezone_offset.format_timestamp(
read_timestamp, "%Y/%m/%d %H:%M")
elif isinstance(read_timestamp, str):
self.read_timestamp = read_timestamp

View File

@@ -4,8 +4,9 @@ import os
import logging
from datetime import datetime
from mimetypes import MimeTypes
from tqdm import tqdm
from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, Device, convert_time_unit
logger = logging.getLogger(__name__)
@@ -34,17 +35,16 @@ def messages(path, data, assume_first_as_me=False):
# Second pass: process the messages
with open(path, "r", encoding="utf8") as file:
for index, line in enumerate(file):
you, user_identification_done = process_line(
line, index, chat, path, you,
assume_first_as_me, user_identification_done
)
with tqdm(total=total_row_number, desc="Processing messages & media", unit="msg&media", leave=False) as pbar:
for index, line in enumerate(file):
you, user_identification_done = process_line(
line, index, chat, path, you,
assume_first_as_me, user_identification_done
)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} messages & media in {convert_time_unit(total_time)}{CLEAR_LINE}")
# Show progress
if index % 1000 == 0:
logger.info(f"Processing messages & media...({index}/{total_row_number})\r")
logger.info(f"Processed {total_row_number} messages & media{CLEAR_LINE}")
return data

View File

@@ -4,12 +4,13 @@ import os
import logging
import shutil
from glob import glob
from tqdm import tqdm
from pathlib import Path
from mimetypes import MimeTypes
from markupsafe import escape as htmle
from Whatsapp_Chat_Exporter.data_model import ChatStore, Message
from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition
from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name, Device
from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, get_chat_condition, Device
from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, safe_name
logger = logging.getLogger(__name__)
@@ -23,17 +24,18 @@ def contacts(db, data):
logger.info(f"Pre-processing contacts...({total_row_number})\r")
c.execute("""SELECT ZWHATSAPPID, ZABOUTTEXT FROM ZWAADDRESSBOOKCONTACT WHERE ZABOUTTEXT IS NOT NULL""")
content = c.fetchone()
while content is not None:
zwhatsapp_id = content["ZWHATSAPPID"]
if not zwhatsapp_id.endswith("@s.whatsapp.net"):
zwhatsapp_id += "@s.whatsapp.net"
with tqdm(total=total_row_number, desc="Processing contacts", unit="contact", leave=False) as pbar:
while (content := c.fetchone()) is not None:
zwhatsapp_id = content["ZWHATSAPPID"]
if not zwhatsapp_id.endswith("@s.whatsapp.net"):
zwhatsapp_id += "@s.whatsapp.net"
current_chat = ChatStore(Device.IOS)
current_chat.status = content["ZABOUTTEXT"]
data.add_chat(zwhatsapp_id, current_chat)
content = c.fetchone()
logger.info(f"Pre-processed {total_row_number} contacts{CLEAR_LINE}")
current_chat = ChatStore(Device.IOS)
current_chat.status = content["ZABOUTTEXT"]
data.add_chat(zwhatsapp_id, current_chat)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Pre-processed {total_row_number} contacts in {convert_time_unit(total_time)}{CLEAR_LINE}")
def process_contact_avatars(current_chat, media_folder, contact_id):
@@ -92,7 +94,6 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat,
"""
c.execute(contact_query)
total_row_number = c.fetchone()[0]
logger.info(f"Processing contacts...({total_row_number})\r")
# Get distinct contacts
contacts_query = f"""
@@ -114,24 +115,24 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat,
c.execute(contacts_query)
# Process each contact
content = c.fetchone()
while content is not None:
contact_name = get_contact_name(content)
contact_id = content["ZCONTACTJID"]
with tqdm(total=total_row_number, desc="Processing contacts", unit="contact", leave=False) as pbar:
while (content := c.fetchone()) is not None:
contact_name = get_contact_name(content)
contact_id = content["ZCONTACTJID"]
# Add or update chat
if contact_id not in data:
current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder))
else:
current_chat = data.get_chat(contact_id)
current_chat.name = contact_name
current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg")
# Add or update chat
if contact_id not in data:
current_chat = data.add_chat(contact_id, ChatStore(Device.IOS, contact_name, media_folder))
else:
current_chat = data.get_chat(contact_id)
current_chat.name = contact_name
current_chat.my_avatar = os.path.join(media_folder, "Media/Profile/Photo.jpg")
# Process avatar images
process_contact_avatars(current_chat, media_folder, contact_id)
content = c.fetchone()
logger.info(f"Processed {total_row_number} contacts{CLEAR_LINE}")
# Process avatar images
process_contact_avatars(current_chat, media_folder, contact_id)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} contacts in {convert_time_unit(total_time)}{CLEAR_LINE}")
# Get message count
message_count_query = f"""
@@ -190,46 +191,42 @@ def messages(db, data, media_folder, timezone_offset, filter_date, filter_chat,
message_map = {row[0][:17]: row[1] or row[2] for row in cursor2.fetchall() if row[0]}
# Process each message
i = 0
content = c.fetchone()
while content is not None:
contact_id = content["ZCONTACTJID"]
message_pk = content["Z_PK"]
is_group_message = content["ZGROUPINFO"] is not None
with tqdm(total=total_row_number, desc="Processing messages", unit="msg", leave=False) as pbar:
while (content := c.fetchone()) is not None:
contact_id = content["ZCONTACTJID"]
message_pk = content["Z_PK"]
is_group_message = content["ZGROUPINFO"] is not None
# Ensure chat exists
if contact_id not in data:
current_chat = data.add_chat(contact_id, ChatStore(Device.IOS))
process_contact_avatars(current_chat, media_folder, contact_id)
else:
current_chat = data.get_chat(contact_id)
# Ensure chat exists
if contact_id not in data:
current_chat = data.add_chat(contact_id, ChatStore(Device.IOS))
process_contact_avatars(current_chat, media_folder, contact_id)
else:
current_chat = data.get_chat(contact_id)
# Create message object
ts = APPLE_TIME + content["ZMESSAGEDATE"]
message = Message(
from_me=content["ZISFROMME"],
timestamp=ts,
time=ts,
key_id=content["ZSTANZAID"][:17],
timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET,
message_type=content["ZMESSAGETYPE"],
received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None,
read_timestamp=None # TODO: Add timestamp
)
# Create message object
ts = APPLE_TIME + content["ZMESSAGEDATE"]
message = Message(
from_me=content["ZISFROMME"],
timestamp=ts,
time=ts,
key_id=content["ZSTANZAID"][:17],
timezone_offset=timezone_offset,
message_type=content["ZMESSAGETYPE"],
received_timestamp=APPLE_TIME + content["ZSENTDATE"] if content["ZSENTDATE"] else None,
read_timestamp=None # TODO: Add timestamp
)
# Process message data
invalid = process_message_data(message, content, is_group_message, data, message_map, no_reply)
# Process message data
invalid = process_message_data(message, content, is_group_message, data, message_map, no_reply)
# Add valid messages to chat
if not invalid:
current_chat.add_message(message_pk, message)
# Add valid messages to chat
if not invalid:
current_chat.add_message(message_pk, message)
# Update progress
i += 1
if i % 1000 == 0:
logger.info(f"Processing messages...({i}/{total_row_number})\r")
content = c.fetchone()
logger.info(f"Processed {total_row_number} messages{CLEAR_LINE}")
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} messages in {convert_time_unit(total_time)}{CLEAR_LINE}")
def process_message_data(message, content, is_group_message, data, message_map, no_reply):
@@ -315,7 +312,7 @@ def process_message_text(message, content):
message.data = msg
def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False):
def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separate_media=False, fix_dot_files=False):
"""Process media files from WhatsApp messages."""
c = db.cursor()
@@ -371,20 +368,15 @@ def media(db, data, media_folder, filter_date, filter_chat, filter_empty, separa
# Process each media item
mime = MimeTypes()
i = 0
content = c.fetchone()
while content is not None:
process_media_item(content, data, media_folder, mime, separate_media)
# Update progress
i += 1
if i % 100 == 0:
logger.info(f"Processing media...({i}/{total_row_number})\r")
content = c.fetchone()
logger.info(f"Processed {total_row_number} media{CLEAR_LINE}")
with tqdm(total=total_row_number, desc="Processing media", unit="media", leave=False) as pbar:
while (content := c.fetchone()) is not None:
process_media_item(content, data, media_folder, mime, separate_media, fix_dot_files)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} media in {convert_time_unit(total_time)}{CLEAR_LINE}")
def process_media_item(content, data, media_folder, mime, separate_media):
def process_media_item(content, data, media_folder, mime, separate_media, fix_dot_files=False):
"""Process a single media item."""
file_path = f"{media_folder}/Message/{content['ZMEDIALOCALPATH']}"
current_chat = data.get_chat(content["ZCONTACTJID"])
@@ -395,14 +387,22 @@ def process_media_item(content, data, media_folder, mime, separate_media):
current_chat.media_base = media_folder + "/"
if os.path.isfile(file_path):
message.data = '/'.join(file_path.split("/")[1:])
# Set MIME type
if content["ZVCARDSTRING"] is None:
guess = mime.guess_type(file_path)[0]
message.mime = guess if guess is not None else "application/octet-stream"
else:
message.mime = content["ZVCARDSTRING"]
if fix_dot_files and file_path.endswith("."):
extension = mime.guess_extension(message.mime)
if message.mime == "application/octet-stream" or not extension:
new_file_path = file_path[:-1]
else:
extension = mime.guess_extension(message.mime)
new_file_path = file_path[:-1] + extension
os.rename(file_path, new_file_path)
file_path = new_file_path
# Handle separate media option
if separate_media:
@@ -413,7 +413,9 @@ def process_media_item(content, data, media_folder, mime, separate_media):
Path(new_folder).mkdir(parents=True, exist_ok=True)
new_path = os.path.join(new_folder, current_filename)
shutil.copy2(file_path, new_path)
message.data = '/'.join(new_path.split("\\")[1:])
message.data = '/'.join(new_path.split("/")[1:])
else:
message.data = '/'.join(file_path.split("/")[1:])
else:
# Handle missing media
message.data = "The media is missing"
@@ -467,10 +469,12 @@ def vcard(db, data, media_folder, filter_date, filter_chat, filter_empty):
Path(path).mkdir(parents=True, exist_ok=True)
# Process each vCard
for index, content in enumerate(contents):
process_vcard_item(content, path, data)
logger.info(f"Processing vCards...({index + 1}/{total_row_number})\r")
logger.info(f"Processed {total_row_number} vCards{CLEAR_LINE}")
with tqdm(total=total_row_number, desc="Processing vCards", unit="vcard", leave=False) as pbar:
for content in contents:
process_vcard_item(content, path, data)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Processed {total_row_number} vCards in {convert_time_unit(total_time)}{CLEAR_LINE}")
def process_vcard_item(content, path, data):
@@ -530,8 +534,6 @@ def calls(db, data, timezone_offset, filter_chat):
if total_row_number == 0:
return
logger.info(f"Processed {total_row_number} calls{CLEAR_LINE}\n")
# Fetch call records
calls_query = f"""
SELECT ZCALLIDSTRING,
@@ -556,14 +558,15 @@ def calls(db, data, timezone_offset, filter_chat):
# Create calls chat
chat = ChatStore(Device.ANDROID, "WhatsApp Calls")
# Process each call
content = c.fetchone()
while content is not None:
process_call_record(content, chat, data, timezone_offset)
content = c.fetchone()
with tqdm(total=total_row_number, desc="Processing calls", unit="call", leave=False) as pbar:
while (content := c.fetchone()) is not None:
process_call_record(content, chat, data, timezone_offset)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
# Add calls chat to data
data.add_chat("000000000000000", chat)
logger.info(f"Processed {total_row_number} calls in {convert_time_unit(total_time)}{CLEAR_LINE}")
def process_call_record(content, chat, data, timezone_offset):
@@ -574,7 +577,7 @@ def process_call_record(content, chat, data, timezone_offset):
timestamp=ts,
time=ts,
key_id=content["ZCALLIDSTRING"],
timezone_offset=timezone_offset if timezone_offset else CURRENT_TZ_OFFSET
timezone_offset=timezone_offset
)
# Set sender info

View File

@@ -6,7 +6,9 @@ import sqlite3
import os
import getpass
from sys import exit, platform as osname
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier
import sys
from tqdm import tqdm
from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, WhatsAppIdentifier, convert_time_unit
from Whatsapp_Chat_Exporter.bplist import BPListReader
try:
from iphone_backup_decrypt import EncryptedBackup, RelativePath
@@ -79,6 +81,8 @@ class BackupExtractor:
logger.info(f"Encryption detected on the backup!{CLEAR_LINE}")
password = getpass.getpass("Enter the password for the backup:")
sys.stdout.write("\033[F\033[K")
sys.stdout.flush()
self._decrypt_backup(password)
self._extract_decrypted_files()
@@ -89,7 +93,7 @@ class BackupExtractor:
Args:
password (str): The password for the encrypted backup.
"""
logger.info(f"Trying to decrypt the iOS backup...{CLEAR_LINE}")
logger.info(f"Trying to open the iOS backup...{CLEAR_LINE}")
self.backup = EncryptedBackup(
backup_directory=self.base_dir,
passphrase=password,
@@ -97,7 +101,7 @@ class BackupExtractor:
check_same_thread=False,
decrypt_chunk_size=self.decrypt_chunk_size,
)
logger.info(f"iOS backup decrypted successfully{CLEAR_LINE}")
logger.info(f"iOS backup is opened successfully{CLEAR_LINE}")
logger.info("Decrypting WhatsApp database...\r")
try:
self.backup.extract_file(
@@ -130,9 +134,12 @@ class BackupExtractor:
def _extract_decrypted_files(self):
"""Extract all WhatsApp files after decryption"""
pbar = tqdm(desc="Decrypting and extracting files", unit="file", leave=False)
def extract_progress_handler(file_id, domain, relative_path, n, total_files):
if n % 100 == 0:
logger.info(f"Decrypting and extracting files...({n}/{total_files})\r")
if pbar.total is None:
pbar.total = total_files
pbar.n = n
pbar.refresh()
return True
self.backup.extract_files(
@@ -141,7 +148,9 @@ class BackupExtractor:
preserve_folders=True,
filter_callback=extract_progress_handler
)
logger.info(f"All required files are decrypted and extracted.{CLEAR_LINE}")
total_time = pbar.format_dict['elapsed']
pbar.close()
logger.info(f"All required files are decrypted and extracted in {convert_time_unit(total_time)}{CLEAR_LINE}")
def _extract_unencrypted_backup(self):
"""
@@ -192,7 +201,6 @@ class BackupExtractor:
c = manifest.cursor()
c.execute(f"SELECT count() FROM Files WHERE domain = '{_wts_id}'")
total_row_number = c.fetchone()[0]
logger.info(f"Extracting WhatsApp files...(0/{total_row_number})\r")
c.execute(
f"""
SELECT fileID, relativePath, flags, file AS metadata,
@@ -205,33 +213,30 @@ class BackupExtractor:
if not os.path.isdir(_wts_id):
os.mkdir(_wts_id)
row = c.fetchone()
while row is not None:
if not row["relativePath"]: # Skip empty relative paths
row = c.fetchone()
continue
with tqdm(total=total_row_number, desc="Extracting WhatsApp files", unit="file", leave=False) as pbar:
while (row := c.fetchone()) is not None:
if not row["relativePath"]: # Skip empty relative paths
continue
destination = os.path.join(_wts_id, row["relativePath"])
hashes = row["fileID"]
folder = hashes[:2]
flags = row["flags"]
destination = os.path.join(_wts_id, row["relativePath"])
hashes = row["fileID"]
folder = hashes[:2]
flags = row["flags"]
if flags == 2: # Directory
try:
os.mkdir(destination)
except FileExistsError:
pass
elif flags == 1: # File
shutil.copyfile(os.path.join(self.base_dir, folder, hashes), destination)
metadata = BPListReader(row["metadata"]).parse()
creation = metadata["$objects"][1]["Birth"]
modification = metadata["$objects"][1]["LastModified"]
os.utime(destination, (modification, modification))
if row["_index"] % 100 == 0:
logger.info(f"Extracting WhatsApp files...({row['_index']}/{total_row_number})\r")
row = c.fetchone()
logger.info(f"Extracted WhatsApp files...({total_row_number}){CLEAR_LINE}")
if flags == 2: # Directory
try:
os.mkdir(destination)
except FileExistsError:
pass
elif flags == 1: # File
shutil.copyfile(os.path.join(self.base_dir, folder, hashes), destination)
metadata = BPListReader(row["metadata"]).parse()
_creation = metadata["$objects"][1]["Birth"]
modification = metadata["$objects"][1]["LastModified"]
os.utime(destination, (modification, modification))
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Extracted {total_row_number} WhatsApp files in {convert_time_unit(total_time)}{CLEAR_LINE}")
def extract_media(base_dir, identifiers, decrypt_chunk_size):

View File

@@ -5,13 +5,13 @@ import json
import os
import unicodedata
import re
import string
import math
import shutil
from bleach import clean as sanitize
from markupsafe import Markup
from datetime import datetime, timedelta
from enum import IntEnum
from tqdm import tqdm
from Whatsapp_Chat_Exporter.data_model import ChatCollection, ChatStore, Timing
from typing import Dict, List, Optional, Tuple, Union
try:
@@ -248,13 +248,13 @@ def import_from_json(json_file: str, data: ChatCollection):
with open(json_file, "r") as f:
temp_data = json.loads(f.read())
total_row_number = len(tuple(temp_data.keys()))
logger.info(f"Importing chats from JSON...(0/{total_row_number})\r")
for index, (jid, chat_data) in enumerate(temp_data.items()):
chat = ChatStore.from_json(chat_data)
data.add_chat(jid, chat)
logger.info(
f"Importing chats from JSON...({index + 1}/{total_row_number})\r")
logger.info(f"Imported {total_row_number} chats from JSON{CLEAR_LINE}")
with tqdm(total=total_row_number, desc="Importing chats from JSON", unit="chat", leave=False) as pbar:
for jid, chat_data in temp_data.items():
chat = ChatStore.from_json(chat_data)
data.add_chat(jid, chat)
pbar.update(1)
total_time = pbar.format_dict['elapsed']
logger.info(f"Imported {total_row_number} chats from JSON in {convert_time_unit(total_time)}{CLEAR_LINE}")
def incremental_merge(source_dir: str, target_dir: str, media_dir: str, pretty_print_json: int, avoid_encoding_json: bool):
@@ -439,7 +439,7 @@ CRYPT14_OFFSETS = (
{"iv": 67, "db": 193},
{"iv": 67, "db": 194},
{"iv": 67, "db": 158},
{"iv": 67, "db": 196}
{"iv": 67, "db": 196},
)
@@ -534,7 +534,7 @@ def determine_metadata(content: sqlite3.Row, init_msg: Optional[str]) -> Optiona
else:
msg = "The security code in this chat changed"
elif content["action_type"] == 58:
msg = "You blocked this contact"
msg = "You blocked/unblocked this contact"
elif content["action_type"] == 67:
return # (PM) this contact use secure service from Facebook???
elif content["action_type"] == 69:
@@ -639,11 +639,17 @@ def get_from_string(msg: Dict, chat_id: str) -> str:
def get_chat_type(chat_id: str) -> str:
"""Return the chat type based on the whatsapp id"""
if chat_id.endswith("@s.whatsapp.net"):
if chat_id == "000000000000000":
return "calls"
elif chat_id.endswith("@s.whatsapp.net"):
return "personal_chat"
if chat_id.endswith("@g.us"):
elif chat_id.endswith("@g.us"):
return "private_group"
logger.warning("Unknown chat type for %s, defaulting to private_group", chat_id)
elif chat_id == "status@broadcast":
return "status_broadcast"
elif chat_id.endswith("@broadcast"):
return "broadcast_channel"
logger.warning(f"Unknown chat type for {chat_id}, defaulting to private_group{CLEAR_LINE}")
return "private_group"
@@ -674,34 +680,35 @@ def telegram_json_format(jik: str, data: Dict, timezone_offset) -> Dict:
except ValueError:
# not a real chat: e.g. statusbroadcast
chat_id = 0
obj = {
"name": data["name"] if data["name"] else jik,
"type": get_chat_type(jik),
"id": chat_id,
"messages": [ {
"id": int(msgId),
"type": "message",
"date": timing.format_timestamp(msg["timestamp"], "%Y-%m-%dT%H:%M:%S"),
"date_unixtime": int(msg["timestamp"]),
"from": get_from_string(msg, chat_id),
"from_id": get_from_id(msg, chat_id),
"reply_to_message_id": get_reply_id(data, msg["reply"]),
"text": msg["data"],
"text_entities": [
{
# TODO this will lose formatting and different types
"type": "plain",
"text": msg["data"],
}
],
} for msgId, msg in data["messages"].items()]
json_obj = {
"name": data["name"] if data["name"] else jik,
"type": get_chat_type(jik),
"id": chat_id,
"messages": [ {
"id": int(msgId),
"type": "message",
"date": timing.format_timestamp(msg["timestamp"], "%Y-%m-%dT%H:%M:%S"),
"date_unixtime": int(msg["timestamp"]),
"from": get_from_string(msg, chat_id),
"from_id": get_from_id(msg, chat_id),
"reply_to_message_id": get_reply_id(data, msg["reply"]),
"text": msg["data"],
"text_entities": [
{
# TODO this will lose formatting and different types
"type": "plain",
"text": msg["data"],
}
],
}
for msgId, msg in data["messages"].items()]
}
# remove empty messages and replies
for msg_id, msg in enumerate(obj["messages"]):
for msg_id, msg in enumerate(json_obj["messages"]):
if not msg["reply_to_message_id"]:
del obj["messages"][msg_id]["reply_to_message_id"]
obj["messages"] = [m for m in obj["messages"] if m["text"]]
return obj
del json_obj["messages"][msg_id]["reply_to_message_id"]
json_obj["messages"] = [m for m in json_obj["messages"] if m["text"]]
return json_obj
class WhatsAppIdentifier(StrEnum):

View File

@@ -281,7 +281,9 @@
{% filter escape %}{{ msg.data }}{% endfilter %}
{% endif %}
{% if msg.caption is not none %}
{{ msg.caption | urlize(none, true, '_blank') }}
<p class='mt-1 {% if "audio/" in msg.mime %}text-[#808080]{% endif %}'>
{{ msg.caption | urlize(none, true, '_blank') }}
</p>
{% endif %}
{% endif %}
{% endif %}
@@ -351,7 +353,9 @@
{% filter escape %}{{ msg.data }}{% endfilter %}
{% endif %}
{% if msg.caption is not none %}
{{ msg.caption | urlize(none, true, '_blank') }}
<p class='mt-1 {% if "audio/" in msg.mime %}text-[#808080]{% endif %}'>
{{ msg.caption | urlize(none, true, '_blank') }}
</p>
{% endif %}
{% endif %}
{% endif %}

View File

@@ -36,17 +36,19 @@ classifiers = [
requires-python = ">=3.10"
dependencies = [
"jinja2",
"bleach"
"bleach",
"tqdm"
]
[project.optional-dependencies]
android_backup = ["pycryptodome", "javaobj-py3"]
ios_backup = ["iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
crypt12 = ["pycryptodome"]
crypt14 = ["pycryptodome"]
crypt15 = ["pycryptodome", "javaobj-py3"]
all = ["pycryptodome", "javaobj-py3"]
everything = ["pycryptodome", "javaobj-py3"]
backup = ["pycryptodome", "javaobj-py3"]
all = ["pycryptodome", "javaobj-py3", "iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
everything = ["pycryptodome", "javaobj-py3", "iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
backup = ["pycryptodome", "javaobj-py3", "iphone_backup_decrypt @ git+https://github.com/KnugiHK/iphone_backup_decrypt"]
[project.scripts]
wtsexporter = "Whatsapp_Chat_Exporter.__main__:main"

27
tests/conftest.py Normal file
View File

@@ -0,0 +1,27 @@
import pytest
import os
def pytest_collection_modifyitems(config, items):
    """
    Reorder the collected tests so those from test_nuitka_binary.py come last.

    The session is aborted with return code 1 when that file cannot be found
    in the ``tests`` directory under ``config.rootdir``. ``items`` is
    reordered in place; all other tests keep their relative order.
    """
    nuitka_module = "test_nuitka_binary.py"
    tests_root = os.path.join(config.rootdir, "tests")

    # Sanity check: refuse to continue when the file we are meant to push to
    # the end is not present in the tests directory.
    if not os.path.exists(os.path.join(tests_root, nuitka_module)):
        pytest.exit(
            f"\n[FATAL] Required test file '{nuitka_module}' not found in {tests_root}. "
            f"Order enforcement failed!",
            returncode=1,
        )

    # list.sort() is stable and False sorts before True, so every item whose
    # node id mentions the Nuitka module sinks to the end while the relative
    # order inside both groups is untouched.
    items.sort(key=lambda collected: nuitka_module in collected.nodeid)