diff --git a/Whatsapp_Chat_Exporter/android_handler.py b/Whatsapp_Chat_Exporter/android_handler.py index d371b4f..46e0511 100644 --- a/Whatsapp_Chat_Exporter/android_handler.py +++ b/Whatsapp_Chat_Exporter/android_handler.py @@ -13,7 +13,7 @@ from Whatsapp_Chat_Exporter.data_model import ChatStore, Message from Whatsapp_Chat_Exporter.utility import CLEAR_LINE, CURRENT_TZ_OFFSET, MAX_SIZE, ROW_SIZE, JidType, Device from Whatsapp_Chat_Exporter.utility import rendering, get_file_name, setup_template, get_cond_for_empty from Whatsapp_Chat_Exporter.utility import get_status_location, convert_time_unit, determine_metadata -from Whatsapp_Chat_Exporter.utility import get_chat_condition, slugify, bytes_to_readable +from Whatsapp_Chat_Exporter.utility import get_chat_condition, safe_name, bytes_to_readable logger = logging.getLogger(__name__) @@ -668,8 +668,8 @@ def _process_single_media(data, content, media_folder, mime, separate_media): # Copy media to separate folder if needed if separate_media: - chat_display_name = slugify(current_chat.name or message.sender - or content["key_remote_jid"].split('@')[0], True) + chat_display_name = safe_name(current_chat.name or message.sender + or content["key_remote_jid"].split('@')[0]) current_filename = file_path.split("/")[-1] new_folder = os.path.join(media_folder, "separated", chat_display_name) Path(new_folder).mkdir(parents=True, exist_ok=True) diff --git a/Whatsapp_Chat_Exporter/ios_handler.py b/Whatsapp_Chat_Exporter/ios_handler.py index 14a43e0..3c40202 100644 --- a/Whatsapp_Chat_Exporter/ios_handler.py +++ b/Whatsapp_Chat_Exporter/ios_handler.py @@ -9,7 +9,7 @@ from mimetypes import MimeTypes from markupsafe import escape as htmle from Whatsapp_Chat_Exporter.data_model import ChatStore, Message from Whatsapp_Chat_Exporter.utility import APPLE_TIME, CLEAR_LINE, CURRENT_TZ_OFFSET, get_chat_condition -from Whatsapp_Chat_Exporter.utility import bytes_to_readable, convert_time_unit, slugify, Device +from 
def safe_name(text: Union[str, bytes]) -> str:
    """
    Sanitize the input text and generate a safe file or folder name.

    This function serves a similar purpose to slugify() from
    Django previously used in this project, but is a clean-room
    reimplementation tailored for performance and a narrower
    use case for this project. Licensed under the same terms
    as the project (MIT).

    The text is NFKC-normalized, every character that is not alphanumeric,
    a hyphen, an underscore, a dot or whitespace is dropped, and each run
    of whitespace is collapsed into a single hyphen. Case is preserved.

    Args:
        text (str|bytes): The text to be sanitized. Bytes are decoded as
            UTF-8 and undecodable sequences are silently dropped.

    Returns:
        str: The sanitized string containing only alphanumerics, underscores,
            hyphens and dots. An empty string is returned when nothing safe
            remains, including when the result would consist solely of dots.

    Raises:
        TypeError: If text is neither str nor bytes.
    """
    if isinstance(text, bytes):
        text = text.decode("utf-8", "ignore")
    elif not isinstance(text, str):
        raise TypeError(
            f"text must be a string or bytes, not {type(text).__name__}"
        )
    normalized_text = unicodedata.normalize("NFKC", text)
    # Keep every kind of whitespace (not just " ") so that tabs and newlines
    # act as word separators instead of gluing neighbouring words together.
    kept = "".join(
        char for char in normalized_text
        if char.isalnum() or char in "-_." or char.isspace()
    )
    sanitized = "-".join(kept.split())
    # The result is used as a path component: "." and ".." (or any dot-only
    # name) would point at the current/parent directory, so reject them.
    if not sanitized.strip("."):
        return ""
    return sanitized
# Test cases to validate the safe_name function: (input text, expected name)
safe_name_test_cases = [
    ("This is a test string", "This-is-a-test-string"),
    ("This is a test string with special characters!@#$%^&*()", "This-is-a-test-string-with-special-characters"),
    ("This is a test string with numbers 1234567890", "This-is-a-test-string-with-numbers-1234567890"),
    ("This is a test string with mixed case ThisIsATestString", "This-is-a-test-string-with-mixed-case-ThisIsATestString"),
    # Runs of spaces must collapse into a single hyphen
    ("This is a test string with extra spaces    ThisIsATestString", "This-is-a-test-string-with-extra-spaces-ThisIsATestString"),
    ("This is a test string with unicode characters äöüß", "This-is-a-test-string-with-unicode-characters-äöüß"),
    ("這是一個包含中文的測試字符串", "這是一個包含中文的測試字符串"),  # Chinese characters, should stay as is
    (
        f"This is a test string with long length {generate_random_string(1000)}",
        f"This-is-a-test-string-with-long-length-{generate_random_string(1000)}",
    ),
    ("", ""),  # Empty string
    (" ", ""),  # String with only space
    ("---", "---"),  # String with only hyphens
    ("___", "___"),  # String with only underscores
    ("a" * 100, "a" * 100),  # Long string with single character
    ("a-b-c-d-e", "a-b-c-d-e"),  # String with hyphen
    ("a_b_c_d_e", "a_b_c_d_e"),  # String with underscore
    ("a b c d e", "a-b-c-d-e"),  # String with spaces
    # Test with URL: "/", "?", "=" and "&" are all dropped
    ("test.com/path/to/resource?param1=value1&param2=value2", "test.compathtoresourceparam1value1param2value2"),
    ("filename.txt", "filename.txt"),  # Test with filename
    ("Αυτή είναι μια δοκιμαστική συμβολοσειρά με ελληνικούς χαρακτήρες.", "Αυτή-είναι-μια-δοκιμαστική-συμβολοσειρά-με-ελληνικούς-χαρακτήρες."),  # Greek characters
    # Mixed with unicode: a free-standing combining breve (U+0306) is removed
    ("This is a test with комбинированные знаки \u0306 example", "This-is-a-test-with-комбинированные-знаки-example"),
]


@pytest.mark.parametrize("input_text, expected_output", safe_name_test_cases)
def test_safe_name(input_text, expected_output):
    """Check that safe_name() maps each sample input to its expected name."""
    assert safe_name(input_text) == expected_output