fix: prevent crashes on invalid paths and silent CDX JSON parse errors

This commit is contained in:
User
2025-09-29 10:27:38 +02:00
parent f840c4a8f1
commit a6f5ee5e1c
3 changed files with 95 additions and 110 deletions

View File

@@ -50,8 +50,12 @@ This webarchive website downloader has an interactive interface, supports downlo
```bash
git clone https://github.com/birbwatcher/wayback-machine-downloader.git
```
Go into the project folder "wayback-machine-downloader":
```bash
cd wayback-machine-downloader
```
```bash
# Install dependencies
npm install
```

View File

@@ -1,5 +1,5 @@
/*
* Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
* Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js
*/
@@ -21,9 +21,7 @@ function renderProgress(current, total) {
const ratio = total > 0 ? current / total : 0;
const filled = Math.round(ratio * width);
const bar = "█".repeat(filled) + "-".repeat(width - filled);
process.stdout.write(
`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
);
process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
if (current === total) process.stdout.write("\n");
}
@@ -47,17 +45,12 @@ function isHtmlFile(filePath, contentType, firstBytes) {
const ext = path.extname(filePath).toLowerCase();
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
const head = (firstBytes || "").toString("utf8", 0, 512);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
// ----------------------------- Archive API -----------------------------
async function getRawListFromApi({
baseUrl,
pageIndex,
all,
fromTimestamp,
toTimestamp,
}) {
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
const cdx = new URL("https://web.archive.org/cdx/search/cdx");
const params = new URLSearchParams();
params.set("output", "json");
@@ -66,43 +59,41 @@ async function getRawListFromApi({
params.set("collapse", "digest");
params.set("gzip", "false");
if (!all) params.append("filter", "statuscode:200");
if (fromTimestamp && Number(fromTimestamp) !== 0)
params.set("from", String(fromTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0)
params.set("to", String(toTimestamp));
if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
if (pageIndex != null) params.set("page", String(pageIndex));
cdx.search = params.toString();
try {
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
const text = await res.text();
const json = JSON.parse(text);
if (
Array.isArray(json) &&
Array.isArray(json[0]) &&
json[0].join(",") === "timestamp,original"
) {
let json = [];
try {
json = JSON.parse(text);
} catch {
// silent: treat as empty page
return [];
}
if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
json.shift();
}
return json || [];
} catch (e) {
console.log(`ERROR getRawListFromApi: ${e}`);
} catch {
// silent: skip broken page
return [];
}
}
// ----------------------------- DOWNLOADER CLASS -----------------------------
class WaybackMachineDownloader {
constructor(params) {
this.base_url = params.base_url;
this.exact_url = !!params.exact_url;
this.directory = params.directory || null;
this.from_timestamp = params.from_timestamp
? Number(params.from_timestamp)
: 0;
this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
this.threads_count =
params.threads_count != null ? Number(params.threads_count) : 3;
this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
this.download_external_assets = params.download_external_assets || false;
@@ -113,49 +104,38 @@ class WaybackMachineDownloader {
this._processed = 0;
}
// Create a human-readable backup folder name, preserving IDNs
backup_name() {
try {
if (this.base_url.includes("//")) {
const u = new URL(this.base_url);
return domainToUnicode(u.host); // use human-readable domain
return domainToUnicode(u.host);
}
} catch {}
return this.base_url;
}
// Resolve output directory
backup_path() {
if (this.directory) {
return this.directory.endsWith(path.sep)
? this.directory
: this.directory + path.sep;
return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
}
return path.join("websites", this.backup_name(), path.sep);
}
// Fetch and merge snapshot lists
async get_all_snapshots_to_consider() {
console.log("Getting snapshot pages");
const httpOpts = {
all: true,
fromTimestamp: this.from_timestamp,
toTimestamp: this.to_timestamp,
};
const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
let list = [];
list = list.concat(
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
);
list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
process.stdout.write(".");
if (!this.exact_url) {
const wildcard = this.base_url.endsWith("/*")
? this.base_url
: this.base_url.replace(/\/*$/, "") + "/*";
const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
for (let i = 0; i < 100; i++) {
const batch = await getRawListFromApi({
baseUrl: wildcard,
pageIndex: i,
...httpOpts,
});
const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
if (!batch || batch.length === 0) break;
list = list.concat(batch);
process.stdout.write(".");
@@ -165,12 +145,15 @@ class WaybackMachineDownloader {
return list;
}
// Choose the latest timestamp per unique pathname
async get_file_list_by_timestamp() {
const curated = new Map();
const all = await this.get_all_snapshots_to_consider();
for (const pair of all) {
const ts = pair[0];
const url = pair[1];
const ts = pair && pair[0];
const url = pair && pair[1];
if (!ts || !url) continue;
try {
const u = new URL(url);
const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
@@ -185,13 +168,13 @@ class WaybackMachineDownloader {
return arr;
}
// Replace Windows-hostile characters when running on Windows
_windowsSanitize(p) {
if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) =>
"%" + s.charCodeAt(0).toString(16)
);
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
}
// Ensure directory exists
async _structure_dir_path(dir_path) {
try {
await mkdir(dir_path, { recursive: true });
@@ -200,9 +183,10 @@ class WaybackMachineDownloader {
}
}
// Compute local file paths for a given archived URL
_determine_paths(file_url, file_id) {
if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
return null;
if (!file_url || !file_id) return null;
if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
if (file_id.length > 200) return null;
const backup = this.backup_path();
@@ -212,15 +196,15 @@ class WaybackMachineDownloader {
if (file_id === "") {
dir_path = backup;
file_path = path.join(backup, "index.html");
} else if (
file_url.endsWith("/") ||
!parts[parts.length - 1].includes(".")
) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
const lastPart = parts[parts.length - 1] || "";
if (file_url.endsWith("/") || !lastPart.includes(".")) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
}
}
dir_path = this._windowsSanitize(dir_path);
@@ -229,6 +213,8 @@ class WaybackMachineDownloader {
return { dir_path, file_path };
}
// Download a single asset (img/css/js/etc.) referenced from an HTML page
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
try {
if (fs.existsSync(file_path)) return file_path;
@@ -261,21 +247,20 @@ class WaybackMachineDownloader {
}
}
// Parse saved HTML, optionally rewrite internal links to relative and fetch assets
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
try {
const backupRoot = this.backup_path();
let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html);
const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
const site = new URL(this.base_url);
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
const baseDir = path.dirname(htmlPath);
const downloadTasks = [];
// ----------- ASSETS -----------
$(
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
).each((_, el) => {
$("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr);
if (!val) return;
@@ -283,12 +268,17 @@ class WaybackMachineDownloader {
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal =
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal || this.download_external_assets) {
const file_id = decodeURIComponent(u.pathname);
const paths = this._determine_paths(abs, file_id);
let paths;
try {
paths = this._determine_paths(abs, file_id);
} catch (e) {
console.log(`Invalid path for asset ${abs}: ${e}`);
return;
}
if (!paths) return;
const { dir_path, file_path } = paths;
@@ -300,9 +290,7 @@ class WaybackMachineDownloader {
}
if (!fs.existsSync(file_path)) {
downloadTasks.push(
this._download_asset(abs, pageTimestamp, file_path, dir_path)
);
downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
}
}
} catch {}
@@ -318,8 +306,7 @@ class WaybackMachineDownloader {
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal =
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal) {
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
@@ -343,17 +330,30 @@ class WaybackMachineDownloader {
}
}
// Download one file from the snapshot list (page or asset saved by CDX)
async _download_single(file_remote_info, total) {
const file_url = String(file_remote_info.file_url);
const file_id = file_remote_info.file_id;
const file_timestamp = file_remote_info.timestamp;
const paths = this._determine_paths(file_url, file_id);
let paths;
try {
paths = this._determine_paths(file_url, file_id);
} catch (e) {
console.log(`Invalid path for ${file_url}: ${e}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
if (!paths) {
console.log(`Skipping invalid URL: ${file_url}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
const { dir_path, file_path } = paths;
if (fs.existsSync(file_path)) {
@@ -387,11 +387,7 @@ class WaybackMachineDownloader {
const contentType = res.headers.get("content-type");
const ext = path.extname(file_path).toLowerCase();
const looksHtml =
isHtmlFile(file_path, contentType, null) ||
ext === "" ||
ext === ".html" ||
ext === ".htm";
const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
if (looksHtml) {
await this._process_html_assets(file_path, file_url, file_timestamp);
}
@@ -403,43 +399,33 @@ class WaybackMachineDownloader {
}
}
// Orchestrate downloads with concurrency
async download_files() {
const startTime = Date.now();
console.log(
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
);
console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
const list = await this.get_file_list_by_timestamp();
if (list.length === 0) {
console.log("No files to download.");
return;
}
const concurrency =
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const limit = pLimit(concurrency);
this._processed = 0;
await Promise.all(
list.map((info) => limit(() => this._download_single(info, list.length)))
);
await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
const endTime = Date.now();
console.log(
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
2
)}s, saved in ${this.backup_path()} (${list.length} files)`
);
console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
}
}
// ============================= INTERACTIVE RUN =============================
function ask(rl, question) {
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
}
async function interactiveMain() {
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
let base_url;
while (true) {
@@ -463,20 +449,15 @@ async function interactiveMain() {
let canonical_action = "keep";
if (rewrite_mode === "relative") {
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
if ((c || '').toLowerCase() === "remove") canonical_action = "remove";
}
let threads_count = await ask(rl, "How many download threads? (default 3): ");
threads_count = parseInt(threads_count || "3", 10);
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
const exact_url = /^y(es)?$/i.test(
await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
);
const directory = await ask(
rl,
"Target directory (leave blank for default websites/<host>/): "
);
const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
const download_external_assets = /^y(es)?$/i.test(ext);

View File

@@ -1,6 +1,6 @@
{
"name": "wayback-downloader",
"version": "0.2.0",
"version": "0.2.1",
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
"type": "module",
"main": "downloader.js",