Files
ak2021-Jahrbuch/htmltomarkdown.py
2026-01-14 21:15:15 +01:00

66 lines
2.7 KiB
Python

import os
from urllib.parse import quote

from markdownify import markdownify as md
def _read_html(html_path):
    """Return the decoded text of *html_path*, or ``None`` if it cannot be read.

    The file is decoded as UTF-8 first. If that raises a decode error it is
    read again as Latin-1, which is how the umlauts in the legacy files are
    stored. I/O errors are printed and reported as ``None`` so that one bad
    file does not abort the whole conversion run.
    """
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(html_path, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid in this encoding; fall through to the next candidate.
            continue
        except OSError as e:
            print(f"Kritischer Fehler bei {html_path}: {e}")
            return None
    # Latin-1 maps every byte value, so this is not expected to be reached.
    return None


def convert_all_html_in_folder(source_folder, output_folder):
    """Convert every HTML file below *source_folder* to Markdown.

    The directory tree under *source_folder* is walked recursively. Each
    file ending in ``.html`` or ``.htm`` (case-insensitive) is converted with
    markdownify using ATX-style headings and written as UTF-8 to the same
    relative path under *output_folder*, with the extension replaced by
    ``.md``. Files that cannot be read, are empty, or fail to convert are
    skipped (failures are printed). Finally ``create_readme`` is called with
    the relative paths of all files that were written.

    Args:
        source_folder: Root directory that is searched for HTML files.
        output_folder: Root directory for the Markdown output; created if
            it does not exist.
    """
    os.makedirs(output_folder, exist_ok=True)
    markdown_files_list = []

    for root, _dirs, files in os.walk(source_folder):
        for file in files:
            # Compare case-insensitively so that e.g. "INDEX.HTM" is included.
            if not file.lower().endswith((".html", ".htm")):
                continue

            html_path = os.path.join(root, file)
            relative_path = os.path.relpath(html_path, source_folder)
            md_filename_rel = os.path.splitext(relative_path)[0] + ".md"
            md_path = os.path.join(output_folder, md_filename_rel)

            html_content = _read_html(html_path)
            if not html_content:
                # Unreadable (already reported) or empty file: nothing to do.
                continue

            try:
                # Create the target directory only once there is something to
                # write, and inside the try so a failure stays per-file.
                os.makedirs(os.path.dirname(md_path), exist_ok=True)
                markdown_text = md(html_content, heading_style="ATX")
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write(markdown_text)
            except Exception as e:
                # Deliberately broad: this is the per-file boundary, and the
                # converter may raise arbitrary errors on malformed HTML.
                print(f"Konvertierungsfehler bei {html_path}: {e}")
            else:
                markdown_files_list.append(md_filename_rel)
                print(f"Erfolgreich: {html_path}")

    create_readme(output_folder, markdown_files_list)
def create_readme(folder, file_list):
    """Write a ``README.md`` index that links to every converted file.

    Entries are listed in sorted order. Each entry shows the file's base
    name and links to its relative path. The link target uses forward
    slashes and is percent-encoded, so paths containing spaces, parentheses
    or non-ASCII characters still form valid Markdown links.

    Args:
        folder: Existing directory in which ``README.md`` is written.
        file_list: Paths of the Markdown files, relative to *folder*.
            The list itself is left unmodified.
    """
    readme_path = os.path.join(folder, "README.md")
    lines = [
        "# Inhaltsverzeichnis der konvertierten Dateien\n\n",
        "Hier sind alle konvertierten Markdown-Dateien aufgelistet:\n\n",
    ]
    # sorted() instead of list.sort() so the caller's list is not mutated.
    for file_rel_path in sorted(file_list):
        display_name = os.path.basename(file_rel_path)
        # Normalise OS-specific separators before encoding; "/" stays literal.
        link_target = quote(file_rel_path.replace(os.sep, "/"))
        lines.append(f"* [{display_name}]({link_target})\n")

    with open(readme_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
    print(f"\nREADME.md wurde erstellt unter: {readme_path}")
# --- Your settings ---
# Default source and target folders; both can be overridden on the command
# line: python htmltomarkdown.py [input_dir] [output_dir]
input_dir = "/Users/calvin/Downloads/ak-21.de/Jahrbuch Umfragen"
output_dir = "/Users/calvin/Downloads/ak-21.de/markdown"

if __name__ == "__main__":
    import argparse  # only needed when the file is run as a script

    parser = argparse.ArgumentParser(
        description="Konvertiert alle HTML-Dateien eines Ordners nach Markdown."
    )
    parser.add_argument(
        "input_dir", nargs="?", default=input_dir,
        help="Quellordner mit den HTML-Dateien",
    )
    parser.add_argument(
        "output_dir", nargs="?", default=output_dir,
        help="Zielordner für die Markdown-Dateien",
    )
    args = parser.parse_args()

    # os.walk() silently yields nothing for a missing folder, which would
    # otherwise end in a misleading success message and an empty README.
    if not os.path.isdir(args.input_dir):
        parser.error(f"Quellordner nicht gefunden: {args.input_dir}")

    convert_all_html_in_folder(args.input_dir, args.output_dir)
    print("\nFertig! Die Umlaute sollten jetzt passen. Gruß, Schnitzel.")