From f840c4a8f1dde74c654f58eabca6329ac2953257 Mon Sep 17 00:00:00 2001 From: User Date: Sun, 28 Sep 2025 15:27:38 +0200 Subject: [PATCH] feat: add support for Cyrillic URLs and paths --- .DS_Store | Bin 0 -> 6148 bytes README.md | 14 +++++++++++++ wayback-machine-downloader/downloader.js | 24 +++++++++++++---------- wayback-machine-downloader/package.json | 2 +- 4 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..53efd372bfa05ff39d81579b005609ce2a9df5c1 GIT binary patch literal 6148 zcmeHKO>fgc5S>laHc^F?1E{E{mAIzRa3EB@m^3-`&`XU#2vD$Vi&}MetJuLpQ6yjC z5Aj2g`gh8k-3=lHI3R=&-HB%3e#{$Z-`ZX;5sASx=@E5^s0C+iv{C%Pc$|I38m?y@ zC`^u$ilbbg#<`AHqJ77|r~t3s21Qihm(ue3@lA0IKe#q#{(hTIXhctFh>;Q9rwdBH zS#f53{*JLj74pBQ809IQ(=Iq%JEM#guDMGIX~ZbXfe~6MO_pp$A>%KiT1`Mlk_bML{y$@o0K zQ2K+Bm;@dsW49bG;3F!(CFi?vqH?X?g1^YU$m!85;1#%$3bjAuJvJ(!~W1Ym1gn z!W}+@yRvX66k)E8{L+Mz2rT;2E8rDaR$$FGJG}poet!O6Ci#|Ez$@@yDIl7|a5%t} z+}^q}Io@j}{0*Fq<7$gvDX5sO7_qz+--R1PzvK!qa9CSJ56piE7#V!w75J+Pd;xHk Bh~@wQ literal 0 HcmV?d00001 diff --git a/README.md b/README.md index c6e24df..7f3ce30 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,20 @@ This webarchive website downloader has an interactive interface, supports downlo --- +## Table of Contents + +- [Features of Web Archive Website Downloader](#features-of-web-archive-website-downloader) + - [Special Features](#special-features) +- [Requirements](#requirements) +- [Installation](#installation) +- [Run](#run) +- [Example](#example) +- [Common Issues](#common-issues) +- [(Important) Download responsibly](#important-download-responsibly) +- [Contributing](#contributing) + +--- + ## Features of Web Archive Website Downloader 1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets. diff --git a/wayback-machine-downloader/downloader.js b/wayback-machine-downloader/downloader.js index d20a86b..590c9c9 100644 --- a/wayback-machine-downloader/downloader.js +++ b/wayback-machine-downloader/downloader.js @@ -1,11 +1,11 @@ /* - * Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM) + * Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM) * Run: node downloader.js */ import fs from "fs"; import path from "path"; -import { fileURLToPath, pathToFileURL } from "url"; +import { fileURLToPath, pathToFileURL, domainToUnicode } from "url"; import { mkdir } from "fs/promises"; import pLimit from "p-limit"; import { load } from "cheerio"; @@ -117,11 +117,12 @@ class WaybackMachineDownloader { try { if (this.base_url.includes("//")) { const u = new URL(this.base_url); - return u.host; + return domainToUnicode(u.host); // use human-readable domain } } catch {} return this.base_url; } + backup_path() { if (this.directory) { return this.directory.endsWith(path.sep) @@ -172,7 +173,7 @@ class WaybackMachineDownloader { const url = pair[1]; try { const u = new URL(url); - const file_id = u.pathname; + const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths const prev = curated.get(file_id); if (!prev || prev.timestamp <= ts) { curated.set(file_id, { file_url: url, timestamp: ts, file_id }); @@ -190,6 +191,7 @@ class WaybackMachineDownloader { "%" + s.charCodeAt(0).toString(16) ); } + async _structure_dir_path(dir_path) { try { await mkdir(dir_path, { recursive: true }); @@ -265,7 +267,7 @@ class WaybackMachineDownloader { let html = fs.readFileSync(htmlPath, "utf8"); const $ = load(html); const site = new URL(this.base_url); - const siteHost = site.hostname.replace(/^www\./, ""); + const siteHost = domainToUnicode(site.hostname.replace(/^www\./, "")); const baseDir = path.dirname(htmlPath); const downloadTasks = []; @@ -281,16 +283,17 @@ class WaybackMachineDownloader { try { const abs = new URL(val, pageUrl).toString(); const u = new URL(abs); - const isInternal = u.hostname.replace(/^www\./, "") === siteHost; + const isInternal = + domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost; if (isInternal || this.download_external_assets) { - const file_id = u.pathname; + const file_id = decodeURIComponent(u.pathname); const paths = this._determine_paths(abs, file_id); if (!paths) return; const { dir_path, file_path } = paths; if (this.rewrite_links) { - const normPath = u.pathname + (u.hash || ""); + const normPath = decodeURIComponent(u.pathname) + (u.hash || ""); const localTarget = ensureLocalTargetForPath(normPath); const localAbsPath = path.join(backupRoot, localTarget); $(el).attr(attr, relativeLink(baseDir, localAbsPath)); @@ -315,10 +318,11 @@ class WaybackMachineDownloader { try { const abs = new URL(val, pageUrl).toString(); const u = new URL(abs); - const isInternal = u.hostname.replace(/^www\./, "") === siteHost; + const isInternal = + domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost; if (isInternal) { - const normPath = u.pathname + (u.hash || ""); + const normPath = decodeURIComponent(u.pathname) + (u.hash || ""); const localTarget = ensureLocalTargetForPath(normPath); const localAbsPath = path.join(backupRoot, localTarget); $(el).attr(attr, relativeLink(baseDir, localAbsPath)); diff --git a/wayback-machine-downloader/package.json b/wayback-machine-downloader/package.json index 144a2b6..c076d8d 100644 --- a/wayback-machine-downloader/package.json +++ b/wayback-machine-downloader/package.json @@ -1,6 +1,6 @@ { "name": "wayback-downloader", - "version": "0.1.0", + "version": "0.2.0", "description": "Interactive Wayback Machine downloader for archiving websites locally.", "type": "module", "main": "downloader.js",