fix: prevent crashes on invalid paths and silent CDX JSON parse errors

This commit is contained in:
User
2025-09-29 10:27:38 +02:00
parent f840c4a8f1
commit a6f5ee5e1c
3 changed files with 95 additions and 110 deletions

View File

@@ -50,8 +50,12 @@ This webarchive website downloader has an interactive interface, supports downlo
```bash
git clone https://github.com/birbwatcher/wayback-machine-downloader.git
```
Go into the project folder "wayback-machine-downloader":
```bash
cd wayback-machine-downloader
```
```bash
# Install dependencies
npm install
```

View File

@@ -1,5 +1,5 @@
/*
* Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
* Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js
*/
@@ -21,9 +21,7 @@ function renderProgress(current, total) {
const ratio = total > 0 ? current / total : 0;
const filled = Math.round(ratio * width);
const bar = "█".repeat(filled) + "-".repeat(width - filled);
process.stdout.write(
`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
);
process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
if (current === total) process.stdout.write("\n");
}
@@ -47,17 +45,12 @@ function isHtmlFile(filePath, contentType, firstBytes) {
const ext = path.extname(filePath).toLowerCase();
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
const head = (firstBytes || "").toString("utf8", 0, 512);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
// ----------------------------- Archive API -----------------------------
async function getRawListFromApi({
baseUrl,
pageIndex,
all,
fromTimestamp,
toTimestamp,
}) {
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
const cdx = new URL("https://web.archive.org/cdx/search/cdx");
const params = new URLSearchParams();
params.set("output", "json");
@@ -66,43 +59,41 @@ async function getRawListFromApi({
params.set("collapse", "digest");
params.set("gzip", "false");
if (!all) params.append("filter", "statuscode:200");
if (fromTimestamp && Number(fromTimestamp) !== 0)
params.set("from", String(fromTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0)
params.set("to", String(toTimestamp));
if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
if (pageIndex != null) params.set("page", String(pageIndex));
cdx.search = params.toString();
try {
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
const text = await res.text();
const json = JSON.parse(text);
if (
Array.isArray(json) &&
Array.isArray(json[0]) &&
json[0].join(",") === "timestamp,original"
) {
let json = [];
try {
json = JSON.parse(text);
} catch {
// silent: treat as empty page
return [];
}
if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
json.shift();
}
return json || [];
} catch (e) {
console.log(`ERROR getRawListFromApi: ${e}`);
} catch {
// silent: skip broken page
return [];
}
}
// ----------------------------- DOWNLOADER CLASS -----------------------------
class WaybackMachineDownloader {
constructor(params) {
this.base_url = params.base_url;
this.exact_url = !!params.exact_url;
this.directory = params.directory || null;
this.from_timestamp = params.from_timestamp
? Number(params.from_timestamp)
: 0;
this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
this.threads_count =
params.threads_count != null ? Number(params.threads_count) : 3;
this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
this.download_external_assets = params.download_external_assets || false;
@@ -113,49 +104,38 @@ class WaybackMachineDownloader {
this._processed = 0;
}
// Create a human-readable backup folder name, preserving IDNs
backup_name() {
try {
if (this.base_url.includes("//")) {
const u = new URL(this.base_url);
return domainToUnicode(u.host); // use human-readable domain
return domainToUnicode(u.host);
}
} catch {}
return this.base_url;
}
// Resolve output directory
backup_path() {
if (this.directory) {
return this.directory.endsWith(path.sep)
? this.directory
: this.directory + path.sep;
return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
}
return path.join("websites", this.backup_name(), path.sep);
}
// Fetch and merge snapshot lists
async get_all_snapshots_to_consider() {
console.log("Getting snapshot pages");
const httpOpts = {
all: true,
fromTimestamp: this.from_timestamp,
toTimestamp: this.to_timestamp,
};
const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
let list = [];
list = list.concat(
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
);
list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
process.stdout.write(".");
if (!this.exact_url) {
const wildcard = this.base_url.endsWith("/*")
? this.base_url
: this.base_url.replace(/\/*$/, "") + "/*";
const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
for (let i = 0; i < 100; i++) {
const batch = await getRawListFromApi({
baseUrl: wildcard,
pageIndex: i,
...httpOpts,
});
const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
if (!batch || batch.length === 0) break;
list = list.concat(batch);
process.stdout.write(".");
@@ -165,12 +145,15 @@ class WaybackMachineDownloader {
return list;
}
// Choose the latest timestamp per unique pathname
async get_file_list_by_timestamp() {
const curated = new Map();
const all = await this.get_all_snapshots_to_consider();
for (const pair of all) {
const ts = pair[0];
const url = pair[1];
const ts = pair && pair[0];
const url = pair && pair[1];
if (!ts || !url) continue;
try {
const u = new URL(url);
const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
@@ -185,13 +168,13 @@ class WaybackMachineDownloader {
return arr;
}
// Replace Windows-hostile characters when running on Windows
_windowsSanitize(p) {
if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) =>
"%" + s.charCodeAt(0).toString(16)
);
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
}
// Ensure directory exists
async _structure_dir_path(dir_path) {
try {
await mkdir(dir_path, { recursive: true });
@@ -200,9 +183,10 @@ class WaybackMachineDownloader {
}
}
// Compute local file paths for a given archived URL
_determine_paths(file_url, file_id) {
if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
return null;
if (!file_url || !file_id) return null;
if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
if (file_id.length > 200) return null;
const backup = this.backup_path();
@@ -212,15 +196,15 @@ class WaybackMachineDownloader {
if (file_id === "") {
dir_path = backup;
file_path = path.join(backup, "index.html");
} else if (
file_url.endsWith("/") ||
!parts[parts.length - 1].includes(".")
) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
const lastPart = parts[parts.length - 1] || "";
if (file_url.endsWith("/") || !lastPart.includes(".")) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
}
}
dir_path = this._windowsSanitize(dir_path);
@@ -229,6 +213,8 @@ class WaybackMachineDownloader {
return { dir_path, file_path };
}
// Download a single asset (img/css/js/etc.) referenced from an HTML page
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
try {
if (fs.existsSync(file_path)) return file_path;
@@ -261,21 +247,20 @@ class WaybackMachineDownloader {
}
}
// Parse saved HTML, optionally rewrite internal links to relative and fetch assets
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
try {
const backupRoot = this.backup_path();
let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html);
const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
const site = new URL(this.base_url);
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
const baseDir = path.dirname(htmlPath);
const downloadTasks = [];
// ----------- ASSETS -----------
$(
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
).each((_, el) => {
$("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr);
if (!val) return;
@@ -283,12 +268,17 @@ class WaybackMachineDownloader {
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal =
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal || this.download_external_assets) {
const file_id = decodeURIComponent(u.pathname);
const paths = this._determine_paths(abs, file_id);
let paths;
try {
paths = this._determine_paths(abs, file_id);
} catch (e) {
console.log(`Invalid path for asset ${abs}: ${e}`);
return;
}
if (!paths) return;
const { dir_path, file_path } = paths;
@@ -300,9 +290,7 @@ class WaybackMachineDownloader {
}
if (!fs.existsSync(file_path)) {
downloadTasks.push(
this._download_asset(abs, pageTimestamp, file_path, dir_path)
);
downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
}
}
} catch {}
@@ -318,8 +306,7 @@ class WaybackMachineDownloader {
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal =
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal) {
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
@@ -343,17 +330,30 @@ class WaybackMachineDownloader {
}
}
// Download one file from the snapshot list (page or asset saved by CDX)
async _download_single(file_remote_info, total) {
const file_url = String(file_remote_info.file_url);
const file_id = file_remote_info.file_id;
const file_timestamp = file_remote_info.timestamp;
const paths = this._determine_paths(file_url, file_id);
let paths;
try {
paths = this._determine_paths(file_url, file_id);
} catch (e) {
console.log(`Invalid path for ${file_url}: ${e}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
if (!paths) {
console.log(`Skipping invalid URL: ${file_url}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
const { dir_path, file_path } = paths;
if (fs.existsSync(file_path)) {
@@ -387,11 +387,7 @@ class WaybackMachineDownloader {
const contentType = res.headers.get("content-type");
const ext = path.extname(file_path).toLowerCase();
const looksHtml =
isHtmlFile(file_path, contentType, null) ||
ext === "" ||
ext === ".html" ||
ext === ".htm";
const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
if (looksHtml) {
await this._process_html_assets(file_path, file_url, file_timestamp);
}
@@ -403,43 +399,33 @@ class WaybackMachineDownloader {
}
}
// Orchestrate downloads with concurrency
async download_files() {
const startTime = Date.now();
console.log(
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
);
console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
const list = await this.get_file_list_by_timestamp();
if (list.length === 0) {
console.log("No files to download.");
return;
}
const concurrency =
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const limit = pLimit(concurrency);
this._processed = 0;
await Promise.all(
list.map((info) => limit(() => this._download_single(info, list.length)))
);
await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
const endTime = Date.now();
console.log(
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
2
)}s, saved in ${this.backup_path()} (${list.length} files)`
);
console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
}
}
// ============================= INTERACTIVE RUN =============================
function ask(rl, question) {
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
}
async function interactiveMain() {
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
let base_url;
while (true) {
@@ -463,20 +449,15 @@ async function interactiveMain() {
let canonical_action = "keep";
if (rewrite_mode === "relative") {
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
if ((c || '').toLowerCase() === "remove") canonical_action = "remove";
}
let threads_count = await ask(rl, "How many download threads? (default 3): ");
threads_count = parseInt(threads_count || "3", 10);
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
const exact_url = /^y(es)?$/i.test(
await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
);
const directory = await ask(
rl,
"Target directory (leave blank for default websites/<host>/): "
);
const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
const download_external_assets = /^y(es)?$/i.test(ext);

View File

@@ -1,6 +1,6 @@
{
"name": "wayback-downloader",
"version": "0.2.0",
"version": "0.2.1",
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
"type": "module",
"main": "downloader.js",