From a6f5ee5e1c4f31760204e45bf4f8bb25c4bfeaa0 Mon Sep 17 00:00:00 2001 From: User Date: Mon, 29 Sep 2025 10:27:38 +0200 Subject: [PATCH] fix: prevent crashes on invalid paths and silent CDX JSON parse errors --- README.md | 6 +- wayback-machine-downloader/downloader.js | 197 ++++++++++------------- wayback-machine-downloader/package.json | 2 +- 3 files changed, 95 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 7f3ce30..ccf730f 100644 --- a/README.md +++ b/README.md @@ -50,8 +50,12 @@ This webarchive website downloader has an interactive interface, supports downlo ```bash git clone https://github.com/birbwatcher/wayback-machine-downloader.git +``` +go to inner folder "wayback-machine-downloader" +```bash cd wayback-machine-downloader - +``` +```bash # Install dependencies npm install ``` diff --git a/wayback-machine-downloader/downloader.js b/wayback-machine-downloader/downloader.js index 590c9c9..ca81597 100644 --- a/wayback-machine-downloader/downloader.js +++ b/wayback-machine-downloader/downloader.js @@ -1,5 +1,5 @@ /* - * Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM) + * Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM) * Run: node downloader.js */ @@ -21,9 +21,7 @@ function renderProgress(current, total) { const ratio = total > 0 ? 
current / total : 0; const filled = Math.round(ratio * width); const bar = "█".repeat(filled) + "-".repeat(width - filled); - process.stdout.write( - `\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})` - ); + process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`); if (current === total) process.stdout.write("\n"); } @@ -47,17 +45,12 @@ function isHtmlFile(filePath, contentType, firstBytes) { const ext = path.extname(filePath).toLowerCase(); if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true; const head = (firstBytes || "").toString("utf8", 0, 512); - return /]/i.test(head); + return /]/i.test(head); } + // ----------------------------- Archive API ----------------------------- -async function getRawListFromApi({ - baseUrl, - pageIndex, - all, - fromTimestamp, - toTimestamp, -}) { +async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) { const cdx = new URL("https://web.archive.org/cdx/search/xd"); const params = new URLSearchParams(); params.set("output", "json"); @@ -66,43 +59,41 @@ async function getRawListFromApi({ params.set("collapse", "digest"); params.set("gzip", "false"); if (!all) params.append("filter", "statuscode:200"); - if (fromTimestamp && Number(fromTimestamp) !== 0) - params.set("from", String(fromTimestamp)); - if (toTimestamp && Number(toTimestamp) !== 0) - params.set("to", String(toTimestamp)); + if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp)); + if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp)); if (pageIndex != null) params.set("page", String(pageIndex)); cdx.search = params.toString(); try { const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" }); const text = await res.text(); - const json = JSON.parse(text); - if ( - Array.isArray(json) && - Array.isArray(json[0]) && - json[0].join(",") === "timestamp,original" - ) { + let json = []; + try { + 
json = JSON.parse(text); + } catch { + // silent: treat as empty page + return []; + } + if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") { json.shift(); } return json || []; - } catch (e) { - console.log(`ERROR getRawListFromApi: ${e}`); + } catch { + // silent: skip broken page return []; } } + // ----------------------------- DOWNLOADER CLASS ----------------------------- class WaybackMachineDownloader { constructor(params) { this.base_url = params.base_url; this.exact_url = !!params.exact_url; this.directory = params.directory || null; - this.from_timestamp = params.from_timestamp - ? Number(params.from_timestamp) - : 0; + this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0; this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0; - this.threads_count = - params.threads_count != null ? Number(params.threads_count) : 3; + this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3; this.download_external_assets = params.download_external_assets || false; @@ -113,49 +104,38 @@ class WaybackMachineDownloader { this._processed = 0; } + // Create a human-readable backup folder name, preserving IDNs backup_name() { try { if (this.base_url.includes("//")) { const u = new URL(this.base_url); - return domainToUnicode(u.host); // use human-readable domain + return domainToUnicode(u.host); } } catch {} return this.base_url; } + // Resolve output directory backup_path() { if (this.directory) { - return this.directory.endsWith(path.sep) - ? this.directory - : this.directory + path.sep; + return this.directory.endsWith(path.sep) ? 
this.directory : this.directory + path.sep; } return path.join("websites", this.backup_name(), path.sep); } + // Fetch and merge snapshot lists async get_all_snapshots_to_consider() { console.log("Getting snapshot pages"); - const httpOpts = { - all: true, - fromTimestamp: this.from_timestamp, - toTimestamp: this.to_timestamp, - }; + const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp }; let list = []; - list = list.concat( - await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }) - ); + list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })); process.stdout.write("."); if (!this.exact_url) { - const wildcard = this.base_url.endsWith("/*") - ? this.base_url - : this.base_url.replace(/\/*$/, "") + "/*"; + const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*"; for (let i = 0; i < 100; i++) { - const batch = await getRawListFromApi({ - baseUrl: wildcard, - pageIndex: i, - ...httpOpts, - }); + const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts }); if (!batch || batch.length === 0) break; list = list.concat(batch); process.stdout.write("."); @@ -165,12 +145,15 @@ class WaybackMachineDownloader { return list; } + + // Choose the latest timestamp per unique pathname async get_file_list_by_timestamp() { const curated = new Map(); const all = await this.get_all_snapshots_to_consider(); for (const pair of all) { - const ts = pair[0]; - const url = pair[1]; + const ts = pair && pair[0]; + const url = pair && pair[1]; + if (!ts || !url) continue; try { const u = new URL(url); const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths @@ -185,13 +168,13 @@ class WaybackMachineDownloader { return arr; } + // Replace Windows-hostile characters when running on Windows _windowsSanitize(p) { if (process.platform !== "win32") return p; - return p.replace(/[:*?&=<>\\|]/g, 
(s) => - "%" + s.charCodeAt(0).toString(16) - ); + return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16)); } + // Ensure directory exists async _structure_dir_path(dir_path) { try { await mkdir(dir_path, { recursive: true }); @@ -200,9 +183,10 @@ class WaybackMachineDownloader { } } + // Compute local file paths for a given archived URL _determine_paths(file_url, file_id) { - if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) - return null; + if (!file_url || !file_id) return null; + if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null; if (file_id.length > 200) return null; const backup = this.backup_path(); @@ -212,15 +196,15 @@ class WaybackMachineDownloader { if (file_id === "") { dir_path = backup; file_path = path.join(backup, "index.html"); - } else if ( - file_url.endsWith("/") || - !parts[parts.length - 1].includes(".") - ) { - dir_path = path.join(backup, ...parts); - file_path = path.join(dir_path, "index.html"); } else { - dir_path = path.join(backup, ...parts.slice(0, -1)); - file_path = path.join(backup, ...parts); + const lastPart = parts[parts.length - 1] || ""; + if (file_url.endsWith("/") || !lastPart.includes(".")) { + dir_path = path.join(backup, ...parts); + file_path = path.join(dir_path, "index.html"); + } else { + dir_path = path.join(backup, ...parts.slice(0, -1)); + file_path = path.join(backup, ...parts); + } } dir_path = this._windowsSanitize(dir_path); @@ -229,6 +213,8 @@ class WaybackMachineDownloader { return { dir_path, file_path }; } + + // Download a single asset (img/css/js/etc.) 
referenced from an HTML page async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) { try { if (fs.existsSync(file_path)) return file_path; @@ -261,21 +247,20 @@ class WaybackMachineDownloader { } } + // Parse saved HTML, optionally rewrite internal links to relative and fetch assets async _process_html_assets(htmlPath, pageUrl, pageTimestamp) { try { const backupRoot = this.backup_path(); let html = fs.readFileSync(htmlPath, "utf8"); - const $ = load(html); + const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is const site = new URL(this.base_url); - const siteHost = domainToUnicode(site.hostname.replace(/^www\./, "")); + const siteHost = domainToUnicode(site.hostname.replace(/^www\./, "")); const baseDir = path.dirname(htmlPath); const downloadTasks = []; // ----------- ASSETS ----------- - $( - "img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]" - ).each((_, el) => { + $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => { const attr = el.tagName === "link" ?
"href" : "src"; const val = $(el).attr(attr); if (!val) return; @@ -283,12 +268,17 @@ class WaybackMachineDownloader { try { const abs = new URL(val, pageUrl).toString(); const u = new URL(abs); - const isInternal = - domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost; + const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost; if (isInternal || this.download_external_assets) { const file_id = decodeURIComponent(u.pathname); - const paths = this._determine_paths(abs, file_id); + let paths; + try { + paths = this._determine_paths(abs, file_id); + } catch (e) { + console.log(`Invalid path for asset ${abs}: ${e}`); + return; + } if (!paths) return; const { dir_path, file_path } = paths; @@ -300,9 +290,7 @@ class WaybackMachineDownloader { } if (!fs.existsSync(file_path)) { - downloadTasks.push( - this._download_asset(abs, pageTimestamp, file_path, dir_path) - ); + downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path)); } } } catch {} @@ -318,8 +306,7 @@ class WaybackMachineDownloader { try { const abs = new URL(val, pageUrl).toString(); const u = new URL(abs); - const isInternal = - domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost; + const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost; if (isInternal) { const normPath = decodeURIComponent(u.pathname) + (u.hash || ""); @@ -343,17 +330,30 @@ class WaybackMachineDownloader { } } + + // Download one file from the snapshot list (page or asset saved by CDX) async _download_single(file_remote_info, total) { const file_url = String(file_remote_info.file_url); const file_id = file_remote_info.file_id; const file_timestamp = file_remote_info.timestamp; - const paths = this._determine_paths(file_url, file_id); + + let paths; + try { + paths = this._determine_paths(file_url, file_id); + } catch (e) { + console.log(`Invalid path for ${file_url}: ${e}`); + this._processed++; + renderProgress(this._processed, total); +
return; + } + if (!paths) { console.log(`Skipping invalid URL: ${file_url}`); this._processed++; renderProgress(this._processed, total); return; } + const { dir_path, file_path } = paths; if (fs.existsSync(file_path)) { @@ -387,11 +387,7 @@ class WaybackMachineDownloader { const contentType = res.headers.get("content-type"); const ext = path.extname(file_path).toLowerCase(); - const looksHtml = - isHtmlFile(file_path, contentType, null) || - ext === "" || - ext === ".html" || - ext === ".htm"; + const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm"; if (looksHtml) { await this._process_html_assets(file_path, file_url, file_timestamp); } @@ -403,43 +399,33 @@ class WaybackMachineDownloader { } } + // Orchestrate downloads with concurrency async download_files() { const startTime = Date.now(); - console.log( - `Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.` - ); + console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`); const list = await this.get_file_list_by_timestamp(); if (list.length === 0) { console.log("No files to download."); return; } - const concurrency = - this.threads_count && this.threads_count > 0 ? this.threads_count : 1; + const concurrency = this.threads_count && this.threads_count > 0 ? 
this.threads_count : 1; const limit = pLimit(concurrency); this._processed = 0; - await Promise.all( - list.map((info) => limit(() => this._download_single(info, list.length))) - ); + await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length)))); const endTime = Date.now(); - console.log( - `\nDownload completed in ${((endTime - startTime) / 1000).toFixed( - 2 - )}s, saved in ${this.backup_path()} (${list.length} files)` - ); + console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`); } } + // ============================= INTERACTIVE RUN ============================= function ask(rl, question) { return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim()))); } async function interactiveMain() { - const rl = readline.createInterface({ - input: process.stdin, - output: process.stdout, - }); + const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); let base_url; while (true) { @@ -463,20 +449,15 @@ async function interactiveMain() { let canonical_action = "keep"; if (rewrite_mode === "relative") { const c = await ask(rl, 'Canonical: "keep" (default) or "remove": '); - if ((c || "").toLowerCase() === "remove") canonical_action = "remove"; + if ((c || '').toLowerCase() === "remove") canonical_action = "remove"; } let threads_count = await ask(rl, "How many download threads? (default 3): "); threads_count = parseInt(threads_count || "3", 10); if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3; - const exact_url = /^y(es)?$/i.test( - await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ") - ); - const directory = await ask( - rl, - "Target directory (leave blank for default websites//): " - ); + const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? 
(yes/no, default no): ")); + const directory = await ask(rl, "Target directory (leave blank for default websites//): "); const ext = await ask(rl, "Download external assets? (yes/no, default no): "); const download_external_assets = /^y(es)?$/i.test(ext); diff --git a/wayback-machine-downloader/package.json b/wayback-machine-downloader/package.json index c076d8d..42d5ba7 100644 --- a/wayback-machine-downloader/package.json +++ b/wayback-machine-downloader/package.json @@ -1,6 +1,6 @@ { "name": "wayback-downloader", - "version": "0.2.0", + "version": "0.2.1", "description": "Interactive Wayback Machine downloader for archiving websites locally.", "type": "module", "main": "downloader.js",