From a6f5ee5e1c4f31760204e45bf4f8bb25c4bfeaa0 Mon Sep 17 00:00:00 2001
From: User <User@MacBook-Pro.local>
Date: Mon, 29 Sep 2025 10:27:38 +0200
Subject: [PATCH] fix: prevent crashes on invalid paths and silence CDX JSON
 parse errors

---
 README.md                                |   6 +-
 wayback-machine-downloader/downloader.js | 197 ++++++++++-------------
 wayback-machine-downloader/package.json  |   2 +-
 3 files changed, 95 insertions(+), 110 deletions(-)
diff --git a/README.md b/README.md
index 7f3ce30..ccf730f 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,12 @@ This webarchive website downloader has an interactive interface, supports downlo
 
 ```bash
 git clone https://github.com/birbwatcher/wayback-machine-downloader.git
+```
+Go to the inner "wayback-machine-downloader" folder:
+```bash
 cd wayback-machine-downloader
-
+```
+```bash
 # Install dependencies
 npm install
 ```
diff --git a/wayback-machine-downloader/downloader.js b/wayback-machine-downloader/downloader.js
index 590c9c9..ca81597 100644
--- a/wayback-machine-downloader/downloader.js
+++ b/wayback-machine-downloader/downloader.js
@@ -1,5 +1,5 @@
 /*
- * Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
+ * Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
  * Run: node downloader.js
  */
 
@@ -21,9 +21,7 @@ function renderProgress(current, total) {
   const ratio = total > 0 ? current / total : 0;
   const filled = Math.round(ratio * width);
   const bar = "█".repeat(filled) + "-".repeat(width - filled);
-  process.stdout.write(
-    `\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
-  );
+  process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
   if (current === total) process.stdout.write("\n");
 }
 
@@ -47,17 +45,12 @@ function isHtmlFile(filePath, contentType, firstBytes) {
   const ext = path.extname(filePath).toLowerCase();
   if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
   const head = (firstBytes || "").toString("utf8", 0, 512);
-  return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
+  return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head); // "<html" followed by whitespace or ">"
 }
 
+
 // ----------------------------- Archive API -----------------------------
-async function getRawListFromApi({
-  baseUrl,
-  pageIndex,
-  all,
-  fromTimestamp,
-  toTimestamp,
-}) {
+async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
   const cdx = new URL("https://web.archive.org/cdx/search/xd");
   const params = new URLSearchParams();
   params.set("output", "json");
@@ -66,43 +59,41 @@ async function getRawListFromApi({
   params.set("collapse", "digest");
   params.set("gzip", "false");
   if (!all) params.append("filter", "statuscode:200");
-  if (fromTimestamp && Number(fromTimestamp) !== 0)
-    params.set("from", String(fromTimestamp));
-  if (toTimestamp && Number(toTimestamp) !== 0)
-    params.set("to", String(toTimestamp));
+  if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
+  if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
   if (pageIndex != null) params.set("page", String(pageIndex));
   cdx.search = params.toString();
 
   try {
     const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
     const text = await res.text();
-    const json = JSON.parse(text);
-    if (
-      Array.isArray(json) &&
-      Array.isArray(json[0]) &&
-      json[0].join(",") === "timestamp,original"
-    ) {
+    let json = [];
+    try {
+      json = JSON.parse(text);
+    } catch {
+      // silent: treat as empty page
+      return [];
+    }
+    if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
       json.shift();
     }
     return json || [];
-  } catch (e) {
-    console.log(`ERROR getRawListFromApi: ${e}`);
+  } catch {
+    // silent: skip broken page
     return [];
   }
 }
 
+
 // ----------------------------- DOWNLOADER CLASS -----------------------------
 class WaybackMachineDownloader {
   constructor(params) {
     this.base_url = params.base_url;
     this.exact_url = !!params.exact_url;
     this.directory = params.directory || null;
-    this.from_timestamp = params.from_timestamp
-      ? Number(params.from_timestamp)
-      : 0;
+    this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
     this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
-    this.threads_count =
-      params.threads_count != null ? Number(params.threads_count) : 3;
+    this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
 
     this.download_external_assets = params.download_external_assets || false;
 
@@ -113,49 +104,38 @@ class WaybackMachineDownloader {
     this._processed = 0;
   }
 
+  // Create a human-readable backup folder name, preserving IDNs
   backup_name() {
     try {
       if (this.base_url.includes("//")) {
         const u = new URL(this.base_url);
-        return domainToUnicode(u.host); // use human-readable domain
+        return domainToUnicode(u.host);
       }
     } catch {}
     return this.base_url;
   }
 
+  // Resolve output directory
   backup_path() {
     if (this.directory) {
-      return this.directory.endsWith(path.sep)
-        ? this.directory
-        : this.directory + path.sep;
+      return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
     }
     return path.join("websites", this.backup_name(), path.sep);
   }
 
+  // Fetch and merge snapshot lists
   async get_all_snapshots_to_consider() {
     console.log("Getting snapshot pages");
-    const httpOpts = {
-      all: true,
-      fromTimestamp: this.from_timestamp,
-      toTimestamp: this.to_timestamp,
-    };
+    const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
     let list = [];
 
-    list = list.concat(
-      await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
-    );
+    list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
     process.stdout.write(".");
 
     if (!this.exact_url) {
-      const wildcard = this.base_url.endsWith("/*")
-        ? this.base_url
-        : this.base_url.replace(/\/*$/, "") + "/*";
+      const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
       for (let i = 0; i < 100; i++) {
-        const batch = await getRawListFromApi({
-          baseUrl: wildcard,
-          pageIndex: i,
-          ...httpOpts,
-        });
+        const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
         if (!batch || batch.length === 0) break;
         list = list.concat(batch);
         process.stdout.write(".");
@@ -165,12 +145,15 @@ class WaybackMachineDownloader {
     return list;
   }
 
+
+  // Choose the latest timestamp per unique pathname
   async get_file_list_by_timestamp() {
     const curated = new Map();
     const all = await this.get_all_snapshots_to_consider();
     for (const pair of all) {
-      const ts = pair[0];
-      const url = pair[1];
+      const ts = pair && pair[0];
+      const url = pair && pair[1];
+      if (!ts || !url) continue;
       try {
         const u = new URL(url);
         const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
@@ -185,13 +168,13 @@ class WaybackMachineDownloader {
     return arr;
   }
 
+  // Replace Windows-hostile characters when running on Windows
   _windowsSanitize(p) {
     if (process.platform !== "win32") return p;
-    return p.replace(/[:*?&=<>\\|]/g, (s) =>
-      "%" + s.charCodeAt(0).toString(16)
-    );
+    return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
   }
 
+  // Ensure directory exists
   async _structure_dir_path(dir_path) {
     try {
       await mkdir(dir_path, { recursive: true });
@@ -200,9 +183,10 @@ class WaybackMachineDownloader {
     }
   }
 
+  // Compute local file paths for a given archived URL
   _determine_paths(file_url, file_id) {
-    if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
-      return null;
+    if (!file_url || file_id == null) return null; // "" stays valid: it maps to the root index.html below
+    if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
     if (file_id.length > 200) return null;
 
     const backup = this.backup_path();
@@ -212,15 +196,15 @@ class WaybackMachineDownloader {
     if (file_id === "") {
       dir_path = backup;
       file_path = path.join(backup, "index.html");
-    } else if (
-      file_url.endsWith("/") ||
-      !parts[parts.length - 1].includes(".")
-    ) {
-      dir_path = path.join(backup, ...parts);
-      file_path = path.join(dir_path, "index.html");
     } else {
-      dir_path = path.join(backup, ...parts.slice(0, -1));
-      file_path = path.join(backup, ...parts);
+      const lastPart = parts[parts.length - 1] || "";
+      if (file_url.endsWith("/") || !lastPart.includes(".")) {
+        dir_path = path.join(backup, ...parts);
+        file_path = path.join(dir_path, "index.html");
+      } else {
+        dir_path = path.join(backup, ...parts.slice(0, -1));
+        file_path = path.join(backup, ...parts);
+      }
     }
 
     dir_path = this._windowsSanitize(dir_path);
@@ -229,6 +213,8 @@ class WaybackMachineDownloader {
     return { dir_path, file_path };
   }
 
+
+  // Download a single asset (img/css/js/etc.) referenced from an HTML page
   async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
     try {
       if (fs.existsSync(file_path)) return file_path;
@@ -261,21 +247,20 @@ class WaybackMachineDownloader {
     }
   }
 
+  // Parse saved HTML, optionally rewrite internal links to relative and fetch assets
   async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
     try {
       const backupRoot = this.backup_path();
       let html = fs.readFileSync(htmlPath, "utf8");
-      const $ = load(html);
+      const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
       const site = new URL(this.base_url);
-      const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
+      const siteHost = domainToUnicode(site.hostname.replace(/^www\./, "")); // strip a leading "www." so www/non-www hosts compare equal
       const baseDir = path.dirname(htmlPath);
 
       const downloadTasks = [];
 
       // ----------- ASSETS -----------
-      $(
-        "img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
-      ).each((_, el) => {
+      $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
         const attr = el.tagName === "link" ? "href" : "src";
         const val = $(el).attr(attr);
         if (!val) return;
@@ -283,12 +268,17 @@ class WaybackMachineDownloader {
         try {
           const abs = new URL(val, pageUrl).toString();
           const u = new URL(abs);
-          const isInternal =
-            domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
+          const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
 
           if (isInternal || this.download_external_assets) {
             const file_id = decodeURIComponent(u.pathname);
-            const paths = this._determine_paths(abs, file_id);
+            let paths;
+            try {
+              paths = this._determine_paths(abs, file_id);
+            } catch (e) {
+              console.log(`Invalid path for asset ${abs}: ${e}`);
+              return;
+            }
             if (!paths) return;
             const { dir_path, file_path } = paths;
 
@@ -300,9 +290,7 @@ class WaybackMachineDownloader {
             }
 
             if (!fs.existsSync(file_path)) {
-              downloadTasks.push(
-                this._download_asset(abs, pageTimestamp, file_path, dir_path)
-              );
+              downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
             }
           }
         } catch {}
@@ -318,8 +306,7 @@ class WaybackMachineDownloader {
           try {
             const abs = new URL(val, pageUrl).toString();
             const u = new URL(abs);
-            const isInternal =
-              domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
+            const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
 
             if (isInternal) {
               const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
@@ -343,17 +330,30 @@ class WaybackMachineDownloader {
     }
   }
 
+
+  // Download one file from the snapshot list (page or asset saved by CDX)
   async _download_single(file_remote_info, total) {
     const file_url = String(file_remote_info.file_url);
     const file_id = file_remote_info.file_id;
     const file_timestamp = file_remote_info.timestamp;
-    const paths = this._determine_paths(file_url, file_id);
+
+    let paths;
+    try {
+      paths = this._determine_paths(file_url, file_id);
+    } catch (e) {
+      console.log(`Invalid path for ${file_url}: ${e}`);
+      this._processed++;
+      renderProgress(this._processed, total);
+      return;
+    }
+
     if (!paths) {
       console.log(`Skipping invalid URL: ${file_url}`);
       this._processed++;
       renderProgress(this._processed, total);
       return;
     }
+
     const { dir_path, file_path } = paths;
 
     if (fs.existsSync(file_path)) {
@@ -387,11 +387,7 @@ class WaybackMachineDownloader {
 
       const contentType = res.headers.get("content-type");
       const ext = path.extname(file_path).toLowerCase();
-      const looksHtml =
-        isHtmlFile(file_path, contentType, null) ||
-        ext === "" ||
-        ext === ".html" ||
-        ext === ".htm";
+      const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
       if (looksHtml) {
         await this._process_html_assets(file_path, file_url, file_timestamp);
       }
@@ -403,43 +399,33 @@ class WaybackMachineDownloader {
     }
   }
 
+  // Orchestrate downloads with concurrency
   async download_files() {
     const startTime = Date.now();
-    console.log(
-      `Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
-    );
+    console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
     const list = await this.get_file_list_by_timestamp();
     if (list.length === 0) {
       console.log("No files to download.");
       return;
     }
 
-    const concurrency =
-      this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
+    const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
     const limit = pLimit(concurrency);
     this._processed = 0;
-    await Promise.all(
-      list.map((info) => limit(() => this._download_single(info, list.length)))
-    );
+    await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
     const endTime = Date.now();
-    console.log(
-      `\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
-        2
-      )}s, saved in ${this.backup_path()} (${list.length} files)`
-    );
+    console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
   }
 }
 
+
 // ============================= INTERACTIVE RUN =============================
 function ask(rl, question) {
   return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
 }
 
 async function interactiveMain() {
-  const rl = readline.createInterface({
-    input: process.stdin,
-    output: process.stdout,
-  });
+  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
 
   let base_url;
   while (true) {
@@ -463,20 +449,15 @@ async function interactiveMain() {
   let canonical_action = "keep";
   if (rewrite_mode === "relative") {
     const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
-    if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
+    if ((c || '').toLowerCase() === "remove") canonical_action = "remove";
   }
 
   let threads_count = await ask(rl, "How many download threads? (default 3): ");
   threads_count = parseInt(threads_count || "3", 10);
   if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
 
-  const exact_url = /^y(es)?$/i.test(
-    await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
-  );
-  const directory = await ask(
-    rl,
-    "Target directory (leave blank for default websites/<host>/): "
-  );
+  const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
+  const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
 
   const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
   const download_external_assets = /^y(es)?$/i.test(ext);
diff --git a/wayback-machine-downloader/package.json b/wayback-machine-downloader/package.json
index c076d8d..42d5ba7 100644
--- a/wayback-machine-downloader/package.json
+++ b/wayback-machine-downloader/package.json
@@ -1,6 +1,6 @@
 {
   "name": "wayback-downloader",
-  "version": "0.2.0",
+  "version": "0.2.1",
   "description": "Interactive Wayback Machine downloader for archiving websites locally.",
   "type": "module",
   "main": "downloader.js",