diff --git a/README.md b/README.md index 86b5ae5..163ce10 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ A customizable Android and iPhone Whatsapp database parser that will give you th First, install the exporter by: ```shell pip install whatsapp-chat-exporter +pip install whatsapp-chat-exporter[android_backup] & :: Optional, if you want it to support decrypting Android WhatsApp backup. ``` Then, create a working directory in somewhere you want ```shell @@ -56,7 +57,6 @@ Do an iPhone Backup with iTunes first. If you want to work on an encrypted iPhone Backup, you should install iphone_backup_decrypt from [KnugiHK/iphone_backup_decrypt](https://github.com/KnugiHK/iphone_backup_decrypt) before you run the extract_iphone_media.py. ```sh -pip install biplist pycryptodome & :: Optional, since the pip will install these dependencies automatically. pip install git+https://github.com/KnugiHK/iphone_backup_decrypt ``` ### Extracting @@ -88,13 +88,15 @@ Options: -m MEDIA, --media=MEDIA Path to WhatsApp media folder -b BACKUP, --backup=BACKUP - Path to iPhone/Android (must be used together with -k) + Path to Android (must be used together with -k)/iPhone WhatsApp backup -o OUTPUT, --output=OUTPUT Output to specific directory -j, --json Save the result to a single JSON file -d DB, --db=DB Path to database file -k KEY, --key=KEY Path to key file + -t TEMPLATE, --template=TEMPLATE + Path to custom HTML template ``` # To do diff --git a/Whatsapp_Chat_Exporter/__init__.py b/Whatsapp_Chat_Exporter/__init__.py index a9e6ff3..4f20502 100644 --- a/Whatsapp_Chat_Exporter/__init__.py +++ b/Whatsapp_Chat_Exporter/__init__.py @@ -1 +1 @@ -__version__ = "0.6" \ No newline at end of file +__version__ = "0.6" diff --git a/Whatsapp_Chat_Exporter/__main__.py b/Whatsapp_Chat_Exporter/__main__.py index ee917ca..47b3b97 100644 --- a/Whatsapp_Chat_Exporter/__main__.py +++ b/Whatsapp_Chat_Exporter/__main__.py @@ -1,5 +1,6 @@ from .__init__ import __version__ -from Whatsapp_Chat_Exporter import extract, extract_iphone, extract_iphone_media +from Whatsapp_Chat_Exporter import extract, extract_iphone +from Whatsapp_Chat_Exporter import extract_iphone_media from optparse import OptionParser import os import sqlite3 @@ -40,7 +41,8 @@ def main(): "--backup", dest="backup", default=None, - help="Path to Android (must be used together with -k)/iPhone WhatsApp backup") + help="Path to Android (must be used together " + "with -k)/iPhone WhatsApp backup") parser.add_option( "-o", "--output", @@ -67,6 +69,12 @@ def main(): default=None, help="Path to key file" ) + parser.add_option( + "-t", + "--template", + dest="template", + default=None, + help="Path to custom HTML template") (options, args) = parser.parse_args() if options.android and options.iphone: @@ -94,8 +102,10 @@ def main(): print("Decryption key specified, decrypting WhatsApp backup...") key = open(options.key, "rb").read() db = open(options.backup, "rb").read() - if not extract.decrypt_backup(db, key, msg_db): - print("Dependencies of decrypt_backup are not present. For details, see README.md") + is_crypt14 = False if "crypt12" in options.backup else True + if not extract.decrypt_backup(db, key, msg_db, is_crypt14): + print("Dependencies of decrypt_backup are not " + "present. For details, see README.md") return False if options.wa is None: contact_db = "wa.db" @@ -103,14 +113,14 @@ def main(): contact_db = options.wa if options.media is None: options.media = "WhatsApp" - + if len(args) == 1: msg_db = args[0] if os.path.isfile(contact_db): with sqlite3.connect(contact_db) as db: contacts(db, data) - + elif options.iphone: messages = extract_iphone.messages media = extract_iphone.media @@ -137,7 +147,7 @@ def main(): messages(db, data) media(db, data, options.media) vcard(db, data) - create_html(data, options.output) + create_html(data, options.output, options.template) if not os.path.isdir(f"{options.output}/{options.media}"): shutil.move(options.media, f"{options.output}/") diff --git a/Whatsapp_Chat_Exporter/extract.py b/Whatsapp_Chat_Exporter/extract.py index 877fdb8..48e1cea 100644 --- a/Whatsapp_Chat_Exporter/extract.py +++ b/Whatsapp_Chat_Exporter/extract.py @@ -8,6 +8,9 @@ import requests import shutil import re import pkgutil +from pathlib import Path +from bleach import clean as sanitize +from markupsafe import Markup from datetime import datetime from mimetypes import MimeTypes try: @@ -19,6 +22,10 @@ else: support_backup = True +def sanitize_except(html): + return Markup(sanitize(html, tags=["br"])) + + def determine_day(last, current): last = datetime.fromtimestamp(last).date() current = datetime.fromtimestamp(current).date() @@ -28,20 +35,27 @@ def determine_day(last, current): return current -def decrypt_backup(database, key, output): +def decrypt_backup(database, key, output, crypt14=True): if not support_backup: return False if len(key) != 158: raise ValueError("The key file must be 158 bytes") - if len(database) < 191: - raise ValueError("The database file must be at least 191 bytes") t1 = key[30:62] - t2 = database[15:47] + if crypt14: + if len(database) < 191: + raise ValueError("The crypt14 file must be at least 191 bytes") + t2 = database[15:47] + iv = database[67:83] + db_ciphertext = database[191:] + else: + if len(database) < 67: + raise ValueError("The crypt12 file must be at least 67 bytes") + t2 = database[3:35] + iv = database[51:67] + db_ciphertext = database[67:-20] if t1 != t2: raise ValueError("The signature of key file and backup file mismatch") - iv = database[67:83] - db_ciphertext = database[191:] main_key = key[126:] cipher = AES.new(main_key, AES.MODE_GCM, iv) db_compressed = cipher.decrypt(db_ciphertext) @@ -105,7 +119,9 @@ def messages(db, data): "timestamp": content[3]/1000, "time": datetime.fromtimestamp(content[3]/1000).strftime("%H:%M"), "media": False, - "key_id": content[13] + "key_id": content[13], + "meta": False, + "data": None } if "-" in content[0] and content[2] == 0: name = None @@ -140,8 +156,9 @@ def messages(db, data): try: int(content[4]) except ValueError: - msg = "{The group name changed to "f"{content[4]}"" }" + msg = f"The group name changed to {content[4]}" data[content[0]]["messages"][content[1]]["data"] = msg + data[content[0]]["messages"][content[1]]["meta"] = True else: del data[content[0]]["messages"][content[1]] else: @@ -160,15 +177,16 @@ def messages(db, data): name_left = data[content[8]]["name"] else: name_left = content[8].split('@')[0] - msg = "{"f"{name_left}"f" added {name_right or 'You'}""}" + msg = f"{name_left} added {name_right or 'You'}" else: - msg = "{"f"Added {name_right or 'You'}""}" + msg = f"Added {name_right or 'You'}" elif b"\xac\xed\x00\x05\x74\x00" in thumb_image: # Changed number original = content[8].split('@')[0] changed = thumb_image[7:].decode().split('@')[0] - msg = "{"f"{original} changed to {changed}""}" + msg = f"{original} changed to {changed}" data[content[0]]["messages"][content[1]]["data"] = msg + data[content[0]]["messages"][content[1]]["meta"] = True else: if content[4] is None: del data[content[0]]["messages"][content[1]] @@ -180,20 +198,34 @@ def messages(db, data): else: if content[2] == 1: if content[5] == 5 and content[6] == 7: - msg = "{Message deleted}" + msg = "Message deleted" + data[content[0]]["messages"][content[1]]["meta"] = True else: if content[9] == "5": - msg = "{ Location shared: "f"{content[10], content[11]}"" }" + msg = f"Location shared: {content[10], content[11]}" + data[content[0]]["messages"][content[1]]["meta"] = True else: msg = content[4] + if msg is not None: + if "\r\n" in msg: + msg = msg.replace("\r\n", "
") + if "\n" in msg: + msg = msg.replace("\n", "
") else: if content[5] == 0 and content[6] == 7: - msg = "{Message deleted}" + msg = "Message deleted" + data[content[0]]["messages"][content[1]]["meta"] = True else: if content[9] == "5": - msg = "{ Location shared: "f"{content[10], content[11]}"" }" + msg = f"Location shared: {content[10], content[11]}" + data[content[0]]["messages"][content[1]]["meta"] = True else: msg = content[4] + if msg is not None: + if "\r\n" in msg: + msg = msg.replace("\r\n", "
") + if "\n" in msg: + msg = msg.replace("\n", "
") data[content[0]]["messages"][content[1]]["data"] = msg @@ -201,8 +233,7 @@ def messages(db, data): if i % 1000 == 0: print(f"Gathering messages...({i}/{total_row_number})", end="\r") content = c.fetchone() - print( - f"Gathering messages...({total_row_number}/{total_row_number})", end="\r") + print(f"Gathering messages...({total_row_number}/{total_row_number})", end="\r") def media(db, data, media_folder): @@ -248,8 +279,9 @@ def media(db, data, media_folder): # data[content[0]]["messages"][content[1]]["media"] = True # data[content[0]]["messages"][content[1]]["mime"] = "media" # else: - data[content[0]]["messages"][content[1]]["data"] = "{The media is missing}" + data[content[0]]["messages"][content[1]]["data"] = "The media is missing" data[content[0]]["messages"][content[1]]["mime"] = "media" + data[content[0]]["messages"][content[1]]["meta"] = True i += 1 if i % 100 == 0: print(f"Gathering media...({i}/{total_row_number})", end="\r") @@ -272,27 +304,34 @@ def vcard(db, data): total_row_number = len(rows) print(f"\nGathering vCards...(0/{total_row_number})", end="\r") base = "WhatsApp/vCards" + if not os.path.isdir(base): + Path(base).mkdir(parents=True, exist_ok=True) for index, row in enumerate(rows): - if not os.path.isdir(base): - os.mkdir(base) file_name = "".join(x for x in row[3] if x.isalnum()) file_path = f"{base}/{file_name}.vcf" if not os.path.isfile(file_path): with open(file_path, "w", encoding="utf-8") as f: f.write(row[2]) data[row[1]]["messages"][row[0]]["data"] = row[3] + \ - "{ The vCard file cannot be displayed here, however it " \ - "should be located at " + file_path + "}" + "The vCard file cannot be displayed here, " \ + f"however it should be located at {file_path}" data[row[1]]["messages"][row[0]]["mime"] = "text/x-vcard" + data[row[1]]["messages"][row[0]]["meta"] = True print(f"Gathering vCards...({index + 1}/{total_row_number})", end="\r") -def create_html(data, output_folder): - templateLoader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)) +def create_html(data, output_folder, template=None): + if template is None: + template_dir = os.path.dirname(__file__) + template_file = "whatsapp.html" + else: + template_dir = os.path.dirname(template) + template_file = os.path.basename(template) + templateLoader = jinja2.FileSystemLoader(searchpath=template_dir) templateEnv = jinja2.Environment(loader=templateLoader) templateEnv.globals.update(determine_day=determine_day) - TEMPLATE_FILE = "whatsapp.html" - template = templateEnv.get_template(TEMPLATE_FILE) + templateEnv.filters['sanitize_except'] = sanitize_except + template = templateEnv.get_template(template_file) total_row_number = len(data) print(f"\nCreating HTML...(0/{total_row_number})", end="\r") diff --git a/Whatsapp_Chat_Exporter/extract_iphone.py b/Whatsapp_Chat_Exporter/extract_iphone.py index cdb96ae..66bbfc7 100644 --- a/Whatsapp_Chat_Exporter/extract_iphone.py +++ b/Whatsapp_Chat_Exporter/extract_iphone.py @@ -7,12 +7,19 @@ import os import requests import shutil import pkgutil +from pathlib import Path +from bleach import clean as sanitize +from markupsafe import Markup from datetime import datetime from mimetypes import MimeTypes APPLE_TIME = datetime.timestamp(datetime(2001, 1, 1)) +def sanitize_except(html): + return Markup(sanitize(html, tags=["br"])) + + def determine_day(last, current): last = datetime.fromtimestamp(last).date() current = datetime.fromtimestamp(current).date() @@ -62,7 +69,9 @@ def messages(db, data): "time": datetime.fromtimestamp(ts).strftime("%H:%M"), "media": False, "reply": None, - "caption": None + "caption": None, + "meta": False, + "data": None } if "-" in content[0] and content[2] == 0: name = None @@ -87,8 +96,9 @@ def messages(db, data): try: int(content[4]) except ValueError: - msg = "{The group name changed to "f"{content[4]}"" }" + msg = f"The group name changed to {content[4]}" data[content[0]]["messages"][content[1]]["data"] = msg + data[content[0]]["messages"][content[1]]["meta"] = True else: del data[content[0]]["messages"][content[1]] else: @@ -99,14 +109,26 @@ def messages(db, data): # real message if content[2] == 1: if content[5] == 14: - msg = "{Message deleted}" + msg = "Message deleted" + data[content[0]]["messages"][content[1]]["meta"] = True else: msg = content[4] + if msg is not None: + if "\r\n" in msg: + msg = msg.replace("\r\n", "
") + if "\n" in msg: + msg = msg.replace("\n", "
") else: if content[5] == 14: - msg = "{Message deleted}" + msg = "Message deleted" + data[content[0]]["messages"][content[1]]["meta"] = True else: msg = content[4] + if msg is not None: + if "\r\n" in msg: + msg = msg.replace("\r\n", "
") + if "\n" in msg: + msg = msg.replace("\n", "
") data[content[0]]["messages"][content[1]]["data"] = msg i += 1 if i % 1000 == 0: @@ -138,7 +160,7 @@ def media(db, data, media_folder): content = c.fetchone() mime = MimeTypes() while content is not None: - file_path = f"Message/{content[2]}" + file_path = f"{media_folder}/{content[2]}" data[content[0]]["messages"][content[1]]["media"] = True if os.path.isfile(file_path): @@ -161,8 +183,9 @@ def media(db, data, media_folder): # data[content[0]]["messages"][content[1]]["data"] = "{The media is missing}" # data[content[0]]["messages"][content[1]]["mime"] = "media" # else: - data[content[0]]["messages"][content[1]]["data"] = "{The media is missing}" + data[content[0]]["messages"][content[1]]["data"] = "The media is missing" data[content[0]]["messages"][content[1]]["mime"] = "media" + data[content[0]]["messages"][content[1]]["meta"] = True if content[6] is not None: data[content[0]]["messages"][content[1]]["caption"] = content[6] i += 1 @@ -190,29 +213,36 @@ def vcard(db, data): total_row_number = len(rows) print(f"\nGathering vCards...(0/{total_row_number})", end="\r") base = "Message/vCards" + if not os.path.isdir(base): + Path(base).mkdir(parents=True, exist_ok=True) for index, row in enumerate(rows): - if not os.path.isdir(base): - os.mkdir(base) file_name = "".join(x for x in row[3] if x.isalnum()) file_path = f"{base}/{file_name[:200]}.vcf" if not os.path.isfile(file_path): with open(file_path, "w", encoding="utf-8") as f: f.write(row[4]) data[row[2]]["messages"][row[1]]["data"] = row[3] + \ - "{ The vCard file cannot be displayed here, however it " \ - "should be located at " + file_path + "}" + "The vCard file cannot be displayed here, " \ + f"however it should be located at {file_path}" data[row[2]]["messages"][row[1]]["mime"] = "text/x-vcard" data[row[2]]["messages"][row[1]]["media"] = True + data[row[2]]["messages"][row[1]]["meta"] = True print(f"Gathering vCards...({index + 1}/{total_row_number})", end="\r") -def create_html(data, output_folder): - templateLoader = jinja2.FileSystemLoader(searchpath=os.path.dirname(__file__)) +def create_html(data, output_folder, template=None): + if template is None: + template_dir = os.path.dirname(__file__) + template_file = "whatsapp.html" + else: + template_dir = os.path.dirname(template) + template_file = os.path.basename(template) + templateLoader = jinja2.FileSystemLoader(searchpath=template_dir) templateEnv = jinja2.Environment(loader=templateLoader) templateEnv.globals.update(determine_day=determine_day) - TEMPLATE_FILE = "whatsapp.html" - template = templateEnv.get_template(TEMPLATE_FILE) - + templateEnv.filters['sanitize_except'] = sanitize_except + template = templateEnv.get_template(template_file) + total_row_number = len(data) print(f"\nCreating HTML...(0/{total_row_number})", end="\r") diff --git a/Whatsapp_Chat_Exporter/extract_iphone_media.py b/Whatsapp_Chat_Exporter/extract_iphone_media.py index 2ce21ca..83b1576 100644 --- a/Whatsapp_Chat_Exporter/extract_iphone_media.py +++ b/Whatsapp_Chat_Exporter/extract_iphone_media.py @@ -6,21 +6,24 @@ import os import getpass try: from iphone_backup_decrypt import EncryptedBackup, RelativePath -except: +except ModuleNotFoundError: support_encrypted = False else: support_encrypted = True + def extract_encrypted(base_dir, password): backup = EncryptedBackup(backup_directory=base_dir, passphrase=password) print("Decrypting WhatsApp database...") - backup.extract_file(relative_path=RelativePath.WHATSAPP_MESSAGES, output_filename="7c7fba66680ef796b916b067077cc246adacf01d") - backup.extract_file(relative_path=RelativePath.WHATSAPP_CONTACTS, output_filename="ContactsV2.sqlite") + backup.extract_file(relative_path=RelativePath.WHATSAPP_MESSAGES, + output_filename="7c7fba66680ef796b916b067077cc246adacf01d") + backup.extract_file(relative_path=RelativePath.WHATSAPP_CONTACTS, + output_filename="ContactsV2.sqlite") data = backup.execute_sql("""SELECT count() FROM Files WHERE relativePath LIKE 'Message/Media/%'""" - ) + ) total_row_number = data[0][0] print(f"Gathering media...(0/{total_row_number})", end="\r") data = backup.execute_sql("""SELECT fileID, @@ -30,7 +33,7 @@ def extract_encrypted(base_dir, password): FROM Files WHERE relativePath LIKE 'Message/Media/%'""" - ) + ) if not os.path.isdir("Message"): os.mkdir("Message") if not os.path.isdir("Message/Media"): @@ -43,7 +46,7 @@ def extract_encrypted(base_dir, password): flags = row[2] file = row[3] if flags == 2: - try: + try: os.mkdir(destination) except FileExistsError: pass @@ -56,6 +59,7 @@ def extract_encrypted(base_dir, password): print(f"Gathering media...({i}/{total_row_number})", end="\r") print(f"Gathering media...({total_row_number}/{total_row_number})", end="\r") + def is_encrypted(base_dir): with sqlite3.connect(f"{base_dir}/Manifest.db") as f: c = f.cursor() @@ -68,6 +72,7 @@ def is_encrypted(base_dir): else: return False + def extract_media(base_dir): if is_encrypted(base_dir): if not support_encrypted: @@ -81,7 +86,7 @@ def extract_media(base_dir): wts_db = os.path.join(base_dir, "7c/7c7fba66680ef796b916b067077cc246adacf01d") if not os.path.isfile(wts_db): print("WhatsApp database not found.") - sys.exit(1) + exit() else: shutil.copyfile(wts_db, "7c7fba66680ef796b916b067077cc246adacf01d") with sqlite3.connect(f"{base_dir}/Manifest.db") as manifest: diff --git a/Whatsapp_Chat_Exporter/whatsapp.html b/Whatsapp_Chat_Exporter/whatsapp.html index 7bea23b..d653ce5 100644 --- a/Whatsapp_Chat_Exporter/whatsapp.html +++ b/Whatsapp_Chat_Exporter/whatsapp.html @@ -72,29 +72,37 @@ "{{ msg.quoted_data or 'media' }}" {% endif %} - {% if msg.media == false %} - {% filter escape %}{{ msg.data or "{This message is not supported yet}" | replace('\n', '
') }}{% endfilter %} + {% if msg.meta == true or msg.media == false and msg.data is none %} +
+

{{ msg.data or 'This message is not supported' }}

+
{% else %} - {% if "image/" in msg.mime %} - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} - {The file cannot be displayed here, however it should be located at {{ msg.data }}} + {% if msg.media == false %} + {{ msg.data | sanitize_except() }} {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} + {% if "image/" in msg.mime %} + + {% elif "audio/" in msg.mime %} + + {% elif "video/" in msg.mime %} + + {% elif "/" in msg.mime %} +
+

The file cannot be displayed here, however it should be located at {{ msg.data }}

+
+ {% else %} + {% filter escape %}{{ msg.data }}{% endfilter %} + {% endif %} + {% if msg.caption is not none %} +
+ {{ msg.caption }} + {% endif %} {% endif %} - {% if msg.caption is not none %} -
- {{ msg.caption }} {% endif %} - {% endif %}
@@ -120,27 +128,35 @@ "{{ msg.quoted_data or 'media' }}" {% endif %} - {% if msg.media == false %} - {% filter escape %}{{ msg.data or "{This message is not supported yet}" }}{% endfilter %} + {% if msg.meta == true or msg.media == false and msg.data is none %} +
+

{{ msg.data or 'This message is not supported' }}

+
{% else %} - {% if "image/" in msg.mime %} - - {% elif "audio/" in msg.mime %} - - {% elif "video/" in msg.mime %} - - {% elif "/" in msg.mime %} - {The file cannot be displayed here, however it should be located at {{ msg.data }}} + {% if msg.media == false %} + {{ msg.data | sanitize_except() }} {% else %} - {% filter escape %}{{ msg.data }}{% endfilter %} - {% endif %} - {% if msg.caption is not none %} -
- {{ msg.caption }} + {% if "image/" in msg.mime %} + + {% elif "audio/" in msg.mime %} + + {% elif "video/" in msg.mime %} + + {% elif "/" in msg.mime %} +
+

The file cannot be displayed here, however it should be located at {{ msg.data }}

+
+ {% else %} + {% filter escape %}{{ msg.data }}{% endfilter %} + {% endif %} + {% if msg.caption is not none %} +
+ {{ msg.caption }} + {% endif %} {% endif %} {% endif %} diff --git a/setup.py b/setup.py index 46a80a9..6633374 100644 --- a/setup.py +++ b/setup.py @@ -12,12 +12,13 @@ setuptools.setup( version=version, author="KnugiHK", author_email="info@knugi.com", - description="A Whatsapp database parser that will give you the history of your Whatsapp conversations in HTML and JSON.", + description="A Whatsapp database parser that will give you the " + "history of your Whatsapp conversations in HTML and JSON.", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/KnugiHK/Whatsapp-Chat-Exporter", packages=setuptools.find_packages(), - package_data = { + package_data={ '': ['whatsapp.html'] }, classifiers=[ @@ -36,9 +37,10 @@ setuptools.setup( ], python_requires='>=3.7', install_requires=[ - 'jinja2' + 'jinja2', + 'bleach' ], - extras_require = { + extras_require={ 'android_backup': ["pycryptodome"] }, entry_points={ @@ -46,4 +48,4 @@ setuptools.setup( "wtsexporter = Whatsapp_Chat_Exporter.__main__:main" ] } -) \ No newline at end of file +)