init: initial project setup

User
2025-09-27 16:41:13 +02:00
parent dbd2f0adda
commit 7541f2ebee
4 changed files with 656 additions and 1 deletions

README.md

@@ -1 +1,113 @@
# wayback-machine-downloader
# Wayback Machine Downloader JS
![Web Archive Website Downloader](assets/webarchive-downloader.jpg)
A script written in **Node.js** for downloading websites from [Web Archive](https://web.archive.org/).
Intended for use by:
- **Webmasters** — to restore their lost or hacked projects
- **OSINT researchers** — for local work with resources that no longer exist
This webarchive website downloader has an interactive interface and supports downloading either with original links preserved or with links rewritten into relative ones (for local browsing).
---
## Features of Web Archive Website Downloader
1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets.
2. Rewrite internal links for correct local browsing.
3. Multithreading support.
4. Save results into a chosen folder while keeping the original structure.
5. Ability to download external assets (e.g., images or scripts from a CDN).
#### Special Features
- The script normalizes parameterized file names such as `main.css?ver=1.2` to `main.css` so assets resolve correctly when browsed locally (see the sketch below).
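The script keys downloaded files off the URL's `pathname`, which the URL parser already separates from the query string. A minimal standalone sketch of the idea (not the script's exact code):
```js
// Sketch: "?ver=1.2" lands in `search`, so `pathname` is already clean.
const asset = new URL("https://example.com/assets/main.css?ver=1.2");
console.log(asset.pathname); // "/assets/main.css"
console.log(asset.search);   // "?ver=1.2"
```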
---
## Requirements
- Node.js version 18.x or higher
---
## Installation
```bash
git clone https://github.com/birbwatcher/wayback-machine-downloader.git
cd wayback-machine-downloader
# Install dependencies
npm install
```
---
## Run
```bash
node downloader.js
```
After launching, an interactive menu will appear with the following questions:
- base URL (e.g., https://example.com)
- date range (from/to)
- number of threads
- link rewriting mode (keep as-is or convert to relative)
- whether to remove `rel=canonical` from the downloaded site
- whether to download external assets
- directory for saving the files
---
## Example
```bash
node downloader.js
```
Dialog example:
```bash
Enter base URL to archive (e.g., https://example.com): https://example.com
From timestamp (YYYYMMDDhhmmss) or leave blank: 20200101000000
To timestamp (YYYYMMDDhhmmss) or leave blank: 20201231235959
Rewrite links? (yes=relative / no=as-is, default no): yes
Canonical: "keep" (default) or "remove": keep
How many download threads? (default 3): 5
Only exact URL (no wildcard /*)? (yes/no, default no): no
Target directory (leave blank for default websites/<host>/):
Download external assets? (yes/no, default no): no
```
After this, the archive download will begin.
---
## Common Issues
#### Script downloads only the homepage
**Answer:** try appending `/*` to the base URL, e.g. `https://example.com/*`, or try downloading a different time range.
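Under the hood the script asks the Wayback CDX API for the list of captures, and the `/*` wildcard is what makes the query match every path on the host. You can inspect what the archive holds for your site with a direct query in the same style the script uses (here additionally filtered to successful captures):
```bash
curl "https://web.archive.org/cdx/search/cdx?url=example.com/*&output=json&fl=timestamp,original&collapse=digest&filter=statuscode:200"
```
An empty result means the archive simply has no matching captures for that pattern or time range.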
---
## (Important) Download responsibly
Please note that downloading third-party websites may violate copyright laws.
Use this tool responsibly and make sure not to break the law.
---
## Contributing
Pull requests are welcome!
For major changes, please open an issue first to discuss what you would like to change.
1. Fork the project
2. Create your feature branch (`git checkout -b feature/fooBar`)
3. Commit your changes (`git commit -am 'Add some fooBar'`)
4. Push to the branch (`git push origin feature/fooBar`)
5. Create a new Pull Request

assets/webarchive-downloader.jpg — binary file not shown (23 KiB)

downloader.js

@@ -0,0 +1,508 @@
/*
* Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js
*/
import fs from "fs";
import path from "path";
import { fileURLToPath, pathToFileURL } from "url";
import { mkdir } from "fs/promises";
import pLimit from "p-limit";
import { load } from "cheerio";
import { Readable } from "stream";
import readline from "readline";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// ----------------------------- PROGRESS BAR -----------------------------
function renderProgress(current, total) {
const width = 40;
const ratio = total > 0 ? current / total : 0;
const filled = Math.round(ratio * width);
const bar = "█".repeat(filled) + "-".repeat(width - filled);
process.stdout.write(
`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
);
if (current === total) process.stdout.write("\n");
}
// ----------------------------- HELPERS -----------------------------
function toPosix(p) {
return p.split(path.sep).join("/");
}
function relativeLink(fromDir, toFile) {
const rel = path.relative(fromDir, toFile);
return toPosix(rel || path.basename(toFile));
}
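// Map a URL pathname to its local file target, e.g.:
//   "/blog/"  -> "/blog/index.html"   (trailing slash: directory index)
//   "/about"  -> "/about/index.html"  (no dot in last segment: treat as page)
//   "/app.css" stays "/app.css"       (has a file extension)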
function ensureLocalTargetForPath(pathname) {
return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".")
? path.posix.join(pathname, "index.html")
: pathname;
}
// ----------------------------- HTML CHECK -----------------------------
function isHtmlFile(filePath, contentType, firstBytes) {
if (contentType && /text\/html/i.test(String(contentType))) return true;
const ext = path.extname(filePath).toLowerCase();
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
const head = (firstBytes || "").toString("utf8", 0, 512);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
// ----------------------------- Archive API -----------------------------
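// The CDX endpoint returns [timestamp, original] rows for every capture of a
// URL; "collapse=digest" drops byte-identical captures, and the
// "statuscode:200" filter (applied only when `all` is false) keeps just
// successful snapshots.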
async function getRawListFromApi({
baseUrl,
pageIndex,
all,
fromTimestamp,
toTimestamp,
}) {
const cdx = new URL("https://web.archive.org/cdx/search/cdx");
const params = new URLSearchParams();
params.set("output", "json");
params.set("url", baseUrl);
params.set("fl", "timestamp,original");
params.set("collapse", "digest");
params.set("gzip", "false");
if (!all) params.append("filter", "statuscode:200");
if (fromTimestamp && Number(fromTimestamp) !== 0)
params.set("from", String(fromTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0)
params.set("to", String(toTimestamp));
if (pageIndex != null) params.set("page", String(pageIndex));
cdx.search = params.toString();
try {
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
const text = await res.text();
const json = JSON.parse(text);
if (
Array.isArray(json) &&
Array.isArray(json[0]) &&
json[0].join(",") === "timestamp,original"
) {
json.shift();
}
return json || [];
} catch (e) {
console.log(`ERROR getRawListFromApi: ${e}`);
return [];
}
}
// ----------------------------- DOWNLOADER CLASS -----------------------------
class WaybackMachineDownloader {
constructor(params) {
this.base_url = params.base_url;
this.exact_url = !!params.exact_url;
this.directory = params.directory || null;
this.from_timestamp = params.from_timestamp
? Number(params.from_timestamp)
: 0;
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
this.threads_count =
params.threads_count != null ? Number(params.threads_count) : 3;
this.download_external_assets = params.download_external_assets || false;
this.rewrite_mode = params.rewrite_mode || "as-is";
this.rewrite_links = this.rewrite_mode === "relative";
this.canonical_action = params.canonical_action || "keep";
this._processed = 0;
}
backup_name() {
try {
if (this.base_url.includes("//")) {
const u = new URL(this.base_url);
return u.host;
}
} catch {}
return this.base_url;
}
backup_path() {
if (this.directory) {
return this.directory.endsWith(path.sep)
? this.directory
: this.directory + path.sep;
}
return path.join("websites", this.backup_name(), path.sep);
}
async get_all_snapshots_to_consider() {
console.log("Getting snapshot pages");
const httpOpts = {
all: true,
fromTimestamp: this.from_timestamp,
toTimestamp: this.to_timestamp,
};
let list = [];
list = list.concat(
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
);
process.stdout.write(".");
if (!this.exact_url) {
const wildcard = this.base_url.endsWith("/*")
? this.base_url
: this.base_url.replace(/\/*$/, "") + "/*";
for (let i = 0; i < 100; i++) {
const batch = await getRawListFromApi({
baseUrl: wildcard,
pageIndex: i,
...httpOpts,
});
if (!batch || batch.length === 0) break;
list = list.concat(batch);
process.stdout.write(".");
}
}
console.log(` found ${list.length} snapshots to consider.\n`);
return list;
}
async get_file_list_by_timestamp() {
const curated = new Map();
const all = await this.get_all_snapshots_to_consider();
for (const pair of all) {
const ts = pair[0];
const url = pair[1];
try {
const u = new URL(url);
const file_id = u.pathname;
const prev = curated.get(file_id);
if (!prev || prev.timestamp <= ts) {
curated.set(file_id, { file_url: url, timestamp: ts, file_id });
}
} catch {}
}
const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
return arr;
}
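// Percent-encode characters that Windows forbids in file names
// (e.g. "?" -> "%3f") so writes on win32 cannot fail on the path.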
_windowsSanitize(p) {
if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) =>
"%" + s.charCodeAt(0).toString(16)
);
}
async _structure_dir_path(dir_path) {
try {
await mkdir(dir_path, { recursive: true });
} catch (e) {
if (!e || e.code !== "EEXIST") throw e;
}
}
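// Map a captured URL onto the local backup tree:
//   ""              -> <backup>/index.html
//   "/a/b/"         -> <backup>/a/b/index.html
//   "/a/b" (no dot) -> <backup>/a/b/index.html
//   "/a/b.css"      -> <backup>/a/b.css
// Returns null for data:/javascript: URLs and for overly long paths.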
_determine_paths(file_url, file_id) {
if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
return null;
if (file_id.length > 200) return null;
const backup = this.backup_path();
const parts = file_id.split("/").filter(Boolean);
let dir_path, file_path;
if (file_id === "") {
dir_path = backup;
file_path = path.join(backup, "index.html");
} else if (
file_url.endsWith("/") ||
!parts[parts.length - 1].includes(".")
) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
}
dir_path = this._windowsSanitize(dir_path);
file_path = this._windowsSanitize(file_path);
return { dir_path, file_path };
}
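// The "id_" suffix in /web/<timestamp>id_/<url> requests the raw original
// capture, without the Wayback toolbar or the archive's rewritten links.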
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
try {
if (fs.existsSync(file_path)) return file_path;
await this._structure_dir_path(dir_path);
const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
return null;
}
if (!res.ok || !res.body) {
console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
return null;
}
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(file_path);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
return file_path;
} catch (e) {
console.log(`Asset download failed for ${assetUrl}: ${e}`);
return null;
}
}
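// Post-process a downloaded HTML page: collect referenced assets for download,
// optionally rewrite internal links and form actions to relative local paths,
// and optionally strip <link rel="canonical"> before writing the file back.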
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
try {
const backupRoot = this.backup_path();
let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html);
const site = new URL(this.base_url);
const siteHost = site.hostname.replace(/^www\./, "");
const baseDir = path.dirname(htmlPath);
const downloadTasks = [];
// ----------- ASSETS -----------
$(
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
).each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
if (isInternal || this.download_external_assets) {
const file_id = u.pathname;
const paths = this._determine_paths(abs, file_id);
if (!paths) return;
const { dir_path, file_path } = paths;
if (this.rewrite_links) {
const localTarget = ensureLocalTargetForPath(u.pathname);
const localAbsPath = path.join(backupRoot, localTarget);
$(el).attr(attr, relativeLink(baseDir, localAbsPath) + (u.hash || ""));
}
if (!fs.existsSync(file_path)) {
downloadTasks.push(
this._download_asset(abs, pageTimestamp, file_path, dir_path)
);
}
}
} catch {}
});
// ----------- INTERNAL LINKS (pages/forms) -----------
if (this.rewrite_links) {
$("a[href], form[action]").each((_, el) => {
const attr = el.tagName === "a" ? "href" : "action";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
if (isInternal) {
const localTarget = ensureLocalTargetForPath(u.pathname);
const localAbsPath = path.join(backupRoot, localTarget);
// Preserve the fragment (#section) on the rewritten local link.
$(el).attr(attr, relativeLink(baseDir, localAbsPath) + (u.hash || ""));
}
} catch {}
});
}
await Promise.all(downloadTasks);
if (this.canonical_action === "remove") {
$("link[rel=\"canonical\"]").remove();
}
fs.writeFileSync(htmlPath, $.html(), "utf8");
} catch (e) {
console.log(`HTML processing error: ${e}`);
}
}
async _download_single(file_remote_info, total) {
const file_url = String(file_remote_info.file_url);
const file_id = file_remote_info.file_id;
const file_timestamp = file_remote_info.timestamp;
const paths = this._determine_paths(file_url, file_id);
if (!paths) {
console.log(`Skipping invalid URL: ${file_url}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
const { dir_path, file_path } = paths;
if (fs.existsSync(file_path)) {
this._processed++;
renderProgress(this._processed, total);
return;
}
try {
await this._structure_dir_path(dir_path);
const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
console.log(`Skipping ${file_url}, fetch failed: ${e}`);
return;
}
if (!res.ok || !res.body) {
console.log(`Skipping ${file_url}, bad response ${res.status}`);
return;
}
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(file_path);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
const contentType = res.headers.get("content-type");
const ext = path.extname(file_path).toLowerCase();
const looksHtml =
isHtmlFile(file_path, contentType, null) ||
ext === "" ||
ext === ".html" ||
ext === ".htm";
if (looksHtml) {
await this._process_html_assets(file_path, file_url, file_timestamp);
}
} catch (e) {
console.log(`Download failed for ${file_url}: ${e}`);
} finally {
this._processed++;
renderProgress(this._processed, total);
}
}
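// p-limit provides the "threads": every download is scheduled up front, but
// at most `threads_count` fetches are actually in flight at any moment.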
async download_files() {
const startTime = Date.now();
console.log(
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
);
const list = await this.get_file_list_by_timestamp();
if (list.length === 0) {
console.log("No files to download.");
return;
}
const concurrency =
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const limit = pLimit(concurrency);
this._processed = 0;
await Promise.all(
list.map((info) => limit(() => this._download_single(info, list.length)))
);
const endTime = Date.now();
console.log(
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
2
)}s, saved in ${this.backup_path()} (${list.length} files)`
);
}
}
// ============================= INTERACTIVE RUN =============================
function ask(rl, question) {
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
}
async function interactiveMain() {
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
let base_url;
while (true) {
base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
if (!base_url) continue;
try {
new URL(base_url);
break;
} catch {
console.log("Please enter a valid URL.\n");
}
}
const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");
let rewrite_mode = "as-is";
const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
if (/^y(es)?$/i.test(m)) rewrite_mode = "relative";
let canonical_action = "keep";
if (rewrite_mode === "relative") {
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
}
let threads_count = await ask(rl, "How many download threads? (default 3): ");
threads_count = parseInt(threads_count || "3", 10);
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
const exact_url = /^y(es)?$/i.test(
await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
);
const directory = await ask(
rl,
"Target directory (leave blank for default websites/<host>/): "
);
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
const download_external_assets = /^y(es)?$/i.test(ext);
rl.close();
const dl = new WaybackMachineDownloader({
base_url,
exact_url,
directory: directory || null,
from_timestamp: from_timestamp || 0,
to_timestamp: to_timestamp || 0,
threads_count,
rewrite_mode,
canonical_action,
download_external_assets,
});
await dl.download_files();
}
const isDirectRun =
process.argv[1] != null &&
(import.meta.url === `file://${process.argv[1]}` ||
import.meta.url === pathToFileURL(process.argv[1]).href);
if (isDirectRun) {
interactiveMain().catch((err) => {
console.error(`FATAL: ${err?.stack || err}`);
process.exit(1);
});
}
export { WaybackMachineDownloader };
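// Example of programmatic use from another ESM module (a minimal sketch;
// option names mirror the interactive prompts above):
//
//   import { WaybackMachineDownloader } from "./downloader.js";
//   const dl = new WaybackMachineDownloader({
//     base_url: "https://example.com",
//     threads_count: 3,
//     rewrite_mode: "relative",  // or "as-is"
//     canonical_action: "keep",  // or "remove"
//   });
//   await dl.download_files();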

package.json

@@ -0,0 +1,35 @@
{
"name": "wayback-downloader",
"version": "0.1.0",
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
"type": "module",
"main": "downloader.js",
"bin": {
"wayback-downloader": "downloader.js"
},
"scripts": {
"start": "node downloader.js"
},
"dependencies": {
"cheerio": "^1.0.0-rc.12",
"p-limit": "^4.0.0"
},
"engines": {
"node": ">=18"
},
"keywords": [
"wayback-machine-downloader",
"web-archive-downloder",
"archiver"
],
"author": "birbwatcher",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/birbwatcher/wayback-downloader.git"
},
"bugs": {
"url": "https://github.com/birbwatcher/wayback-downloader/issues"
},
"homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
}