diff --git a/README.md b/README.md
index 8f3501a..c6e24df 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,113 @@
-# wayback-machine-downloader
\ No newline at end of file
+# Wayback Machine Downloader JS
+
+![Web Archive Website Downloader](assets/webarchive-downloader.jpg)
+
+A script written in **Node.js** for downloading websites from [Web Archive](https://web.archive.org/).
+
+Intended for use by:
+- **Webmasters** — to restore their lost or hacked projects
+- **OSINT researchers** — for local work with resources that no longer exist
+
+This Web Archive website downloader has an interactive interface and can save pages either with their original links preserved or with links rewritten into relative ones (for local usage).
+
+---
+
+## Features of Web Archive Website Downloader
+
+1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets.
+2. Rewrite internal links for correct local browsing.
+3. Multithreading support.
+4. Save results into a chosen folder while keeping the original structure.
+5. Ability to download external assets (e.g., images or scripts from a CDN).
+
+#### Special Features
+
+- The script fixes parameterized file names such as `main.css?ver=1.2` into `main.css` for proper local work.
+
+---
+
+## Requirements
+
+- Node.js version 18.x or higher
+
+---
+
+## Installation
+
+```bash
+git clone https://github.com/birbwatcher/wayback-machine-downloader.git
+cd wayback-machine-downloader/wayback-machine-downloader
+
+# Install dependencies
+npm install
+```
+
+---
+
+## Run
+
+```bash
+node downloader.js
+```
+
+After launching, an interactive menu will appear with the following questions:
+
+- base URL (e.g., https://example.com)
+- date range (from/to)
+- number of threads
+- link rewriting mode (keep as-is or convert to relative)
+- whether to remove `rel=canonical` from the downloaded site
+- whether to download external assets
+- directory for saving the files
+
+---
+
+## Example
+
+```bash
+node downloader.js
+```
+
+Dialog example:
+
+```bash
+Enter base URL to archive (e.g., https://example.com): https://example.com
+From timestamp (YYYYMMDDhhmmss) or leave blank: 20200101000000
+To timestamp (YYYYMMDDhhmmss) or leave blank: 20201231235959
+Rewrite links? (yes=relative / no=as-is, default no): yes
+Canonical: "keep" (default) or "remove": keep
+How many download threads? (default 3): 5
+Only exact URL (no wildcard /*)? (yes/no, default no): no
+Target directory (leave blank for default websites/<host>/):
+Download external assets? (yes/no, default no): no
+```
+
+After this, the archive download will begin.
+
+---
+
+## Common Issues
+
+#### Script downloads only the homepage
+**Answer:** try specifying the base URL with `/*` at the end, for example `https://example.com/*`.
+If that does not help, try downloading a different time range.
+
+---
+
+## (Important) Download responsibly
+
+Please note that downloading third-party websites may violate copyright laws.
+Use this tool responsibly and make sure not to break the law.
+
+---
+
+## Contributing
+
+Pull requests are welcome!
+For major changes, please open an issue first to discuss what you would like to change.
+
+1. Fork the project
+2. Create your feature branch (`git checkout -b feature/fooBar`)
+3. 
Commit your changes (`git commit -am 'Add some fooBar'`)
+4. Push to the branch (`git push origin feature/fooBar`)
+5. Create a new Pull Request
diff --git a/assets/webarchive-downloader.jpg b/assets/webarchive-downloader.jpg
new file mode 100644
index 0000000..75f3e9f
Binary files /dev/null and b/assets/webarchive-downloader.jpg differ
diff --git a/wayback-machine-downloader/downloader.js b/wayback-machine-downloader/downloader.js
new file mode 100644
index 0000000..d20a86b
--- /dev/null
+++ b/wayback-machine-downloader/downloader.js
@@ -0,0 +1,663 @@
+#!/usr/bin/env node
+/*
+ * Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM)
+ * Run: node downloader.js
+ */
+
+import fs from "fs";
+import path from "path";
+import { fileURLToPath, pathToFileURL } from "url";
+import { mkdir, rm } from "fs/promises";
+import pLimit from "p-limit";
+import { load } from "cheerio";
+import { Readable } from "stream";
+import { pipeline } from "stream/promises";
+import readline from "readline";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+
+// ----------------------------- PROGRESS BAR -----------------------------
+/**
+ * Redraw the one-line progress bar on stdout.
+ *
+ * @param {number} current - Items processed so far.
+ * @param {number} total - Total number of items.
+ */
+function renderProgress(current, total) {
+  const width = 40;
+  // Clamp to [0, 1] so a miscount can never pass a negative length to repeat().
+  const ratio = total > 0 ? Math.min(Math.max(current / total, 0), 1) : 0;
+  const filled = Math.round(ratio * width);
+  const bar = "█".repeat(filled) + "-".repeat(width - filled);
+  process.stdout.write(
+    `\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
+  );
+  if (current === total) process.stdout.write("\n");
+}
+
+// ----------------------------- HELPERS -----------------------------
+/** Convert an OS-specific path into a forward-slash path. */
+function toPosix(p) {
+  return p.split(path.sep).join("/");
+}
+
+/** Build the forward-slash relative link from directory `fromDir` to `toFile`. */
+function relativeLink(fromDir, toFile) {
+  const rel = path.relative(fromDir, toFile);
+  return toPosix(rel || path.basename(toFile));
+}
+
+/**
+ * Map a URL pathname to its local file: a trailing slash or a last segment
+ * without a dot is treated as a directory and gets "index.html" appended.
+ */
+function ensureLocalTargetForPath(pathname) {
+  return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".")
+    ? path.posix.join(pathname, "index.html")
+    : pathname;
+}
+
+// ----------------------------- HTML CHECK -----------------------------
+/**
+ * Decide whether a downloaded file should be post-processed as HTML.
+ *
+ * @param {string} filePath - Local path of the file (only its extension is used).
+ * @param {?string} contentType - Value of the Content-Type header, if any.
+ * @param {?(Buffer|string)} firstBytes - Optional start of the file content.
+ * @returns {boolean} True when the header, the extension or the content says HTML.
+ */
+function isHtmlFile(filePath, contentType, firstBytes) {
+  if (contentType && /text\/html/i.test(String(contentType))) return true;
+  const ext = path.extname(filePath).toLowerCase();
+  if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
+  const head = Buffer.isBuffer(firstBytes)
+    ? firstBytes.toString("utf8", 0, 512)
+    : String(firstBytes || "").slice(0, 512);
+  // Content sniffing: look for a doctype or an opening <html> tag.
+  return /<!doctype html|<html[\s>]/i.test(head);
+}
+
+// ----------------------------- Archive API -----------------------------
+/**
+ * Query the Wayback Machine CDX index for one page of [timestamp, original] rows.
+ *
+ * @param {object} query
+ * @param {string} query.baseUrl - URL (or URL pattern) to look up.
+ * @param {?number} query.pageIndex - Result page to request; omitted when null.
+ * @param {boolean} query.all - When false, only rows with status code 200 are requested.
+ * @param {number|string} query.fromTimestamp - Lower bound; 0 or empty means no bound.
+ * @param {number|string} query.toTimestamp - Upper bound; 0 or empty means no bound.
+ * @returns {Promise<Array<Array<string>>>} Rows without the header; [] on any error.
+ */
+async function getRawListFromApi({
+  baseUrl,
+  pageIndex,
+  all,
+  fromTimestamp,
+  toTimestamp,
+}) {
+  const cdx = new URL("https://web.archive.org/cdx/search/xd");
+  const params = new URLSearchParams();
+  params.set("output", "json");
+  params.set("url", baseUrl);
+  params.set("fl", "timestamp,original");
+  params.set("collapse", "digest");
+  params.set("gzip", "false");
+  if (!all) params.append("filter", "statuscode:200");
+  if (fromTimestamp && Number(fromTimestamp) !== 0)
+    params.set("from", String(fromTimestamp));
+  if (toTimestamp && Number(toTimestamp) !== 0)
+    params.set("to", String(toTimestamp));
+  if (pageIndex != null) params.set("page", String(pageIndex));
+  cdx.search = params.toString();
+
+  try {
+    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
+    const text = await res.text();
+    // An empty body is reported as "no rows" rather than as a JSON parse error.
+    if (text.trim() === "") return [];
+    const json = JSON.parse(text);
+    // Anything but an array of rows would break the callers' concat/length logic.
+    if (!Array.isArray(json)) return [];
+    if (Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
+      json.shift();
+    }
+    return json;
+  } catch (e) {
+    console.log(`ERROR getRawListFromApi: ${e}`);
+    return [];
+  }
+}
+
+// ----------------------------- DOWNLOADER CLASS -----------------------------
+/**
+ * Downloads the archived files of one site into a local directory and can
+ * rewrite links in the saved HTML so they point at the local copies.
+ */
+class WaybackMachineDownloader {
+  /**
+   * @param {object} params
+   * @param {string} params.base_url - Site URL to download.
+   * @param {boolean} [params.exact_url] - Skip the wildcard lookup.
+   * @param {?string} [params.directory] - Target directory; default "websites/<host>/".
+   * @param {number|string} [params.from_timestamp] - Lower bound, 0/empty for none.
+   * @param {number|string} [params.to_timestamp] - Upper bound, 0/empty for none.
+   * @param {number} [params.threads_count=3] - Number of parallel downloads.
+   * @param {boolean} [params.download_external_assets] - Also fetch off-site assets.
+   * @param {string} [params.rewrite_mode="as-is"] - "relative" rewrites links.
+   * @param {string} [params.canonical_action="keep"] - "remove" drops rel=canonical.
+   */
+  constructor(params) {
+    this.base_url = params.base_url;
+    this.exact_url = !!params.exact_url;
+    this.directory = params.directory || null;
+    this.from_timestamp = params.from_timestamp
+      ? Number(params.from_timestamp)
+      : 0;
+    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
+    this.threads_count =
+      params.threads_count != null ? Number(params.threads_count) : 3;
+
+    this.download_external_assets = params.download_external_assets || false;
+
+    this.rewrite_mode = params.rewrite_mode || "as-is";
+    this.rewrite_links = this.rewrite_mode === "relative";
+    this.canonical_action = params.canonical_action || "keep";
+
+    this._processed = 0;
+    // file_path -> promise of the asset download currently writing that file.
+    this._asset_jobs = new Map();
+  }
+
+  /** Directory name for the backup: the host of base_url, else base_url itself. */
+  backup_name() {
+    try {
+      if (this.base_url.includes("//")) {
+        const u = new URL(this.base_url);
+        return u.host;
+      }
+    } catch {
+      // Not parseable as a URL: fall through and use the raw value.
+    }
+    return this.base_url;
+  }
+
+  /** Target directory with a trailing separator. */
+  backup_path() {
+    if (this.directory) {
+      return this.directory.endsWith(path.sep)
+        ? this.directory
+        : this.directory + path.sep;
+    }
+    return path.join("websites", this.backup_name(), path.sep);
+  }
+
+  /**
+   * Collect CDX rows for the base URL and, unless exact_url is set, for up to
+   * 100 pages of the wildcard pattern.
+   *
+   * @returns {Promise<Array<Array<string>>>} [timestamp, original] rows.
+   */
+  async get_all_snapshots_to_consider() {
+    console.log("Getting snapshot pages");
+    const httpOpts = {
+      all: true,
+      fromTimestamp: this.from_timestamp,
+      toTimestamp: this.to_timestamp,
+    };
+    let list = [];
+
+    list = list.concat(
+      await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
+    );
+    process.stdout.write(".");
+
+    if (!this.exact_url) {
+      const wildcard = this.base_url.endsWith("/*")
+        ? this.base_url
+        : this.base_url.replace(/\/*$/, "") + "/*";
+      for (let i = 0; i < 100; i++) {
+        const batch = await getRawListFromApi({
+          baseUrl: wildcard,
+          pageIndex: i,
+          ...httpOpts,
+        });
+        if (!batch || batch.length === 0) break;
+        list = list.concat(batch);
+        process.stdout.write(".");
+      }
+    }
+    console.log(` found ${list.length} snapshots to consider.\n`);
+    return list;
+  }
+
+  /**
+   * Keep the newest snapshot per URL pathname.
+   *
+   * @returns {Promise<Array<{file_url: string, timestamp: string, file_id: string}>>}
+   *   Entries sorted by timestamp, newest first.
+   */
+  async get_file_list_by_timestamp() {
+    const curated = new Map();
+    const all = await this.get_all_snapshots_to_consider();
+    for (const pair of all) {
+      const ts = pair[0];
+      const url = pair[1];
+      try {
+        const u = new URL(url);
+        const file_id = u.pathname;
+        const prev = curated.get(file_id);
+        if (!prev || prev.timestamp <= ts) {
+          curated.set(file_id, { file_url: url, timestamp: ts, file_id });
+        }
+      } catch {
+        // Rows whose URL cannot be parsed are skipped.
+      }
+    }
+    const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
+    arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
+    return arr;
+  }
+
+  /** On Windows, percent-encode the characters : * ? & = < > \ | in `p`. */
+  _windowsSanitize(p) {
+    if (process.platform !== "win32") return p;
+    return p.replace(/[:*?&=<>\\|]/g, (s) =>
+      "%" + s.charCodeAt(0).toString(16)
+    );
+  }
+
+  /** Create dir_path and any missing parents. */
+  async _structure_dir_path(dir_path) {
+    try {
+      await mkdir(dir_path, { recursive: true });
+    } catch (e) {
+      if (!e || e.code !== "EEXIST") throw e;
+    }
+  }
+
+  /**
+   * Work out where a URL is stored locally.
+   *
+   * @param {string} file_url - Full URL of the resource.
+   * @param {string} file_id - Pathname of that URL.
+   * @returns {?{dir_path: string, file_path: string}} Null for data:/javascript:
+   *   URLs and for pathnames longer than 200 characters.
+   */
+  _determine_paths(file_url, file_id) {
+    if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
+      return null;
+    if (file_id.length > 200) return null;
+
+    const backup = this.backup_path();
+    // Sanitize each URL segment on its own: running the whole path through
+    // _windowsSanitize would also encode the drive colon and the separators.
+    const parts = file_id
+      .split("/")
+      .filter(Boolean)
+      .map((segment) => this._windowsSanitize(segment));
+    let dir_path, file_path;
+
+    if (parts.length === 0) {
+      // Site root ("/" or ""), even when file_url carries a query string.
+      dir_path = backup;
+      file_path = path.join(backup, "index.html");
+    } else if (
+      file_url.endsWith("/") ||
+      !parts[parts.length - 1].includes(".")
+    ) {
+      dir_path = path.join(backup, ...parts);
+      file_path = path.join(dir_path, "index.html");
+    } else {
+      dir_path = path.join(backup, ...parts.slice(0, -1));
+      file_path = path.join(backup, ...parts);
+    }
+
+    return { dir_path, file_path };
+  }
+
+  /**
+   * Fetch the snapshot of `url` taken at `timestamp` and stream it to `file_path`.
+   *
+   * @param {string} url - Original URL of the resource.
+   * @param {string} timestamp - Snapshot timestamp.
+   * @param {string} file_path - Destination file.
+   * @param {string} dir_path - Directory to create before writing.
+   * @param {string} label - Prefix for the URL in "Skipping ..." log lines.
+   * @returns {Promise<?Response>} The response, or null when the fetch was skipped.
+   * @throws If writing the file fails; the partial file is removed first.
+   */
+  async _save_snapshot(url, timestamp, file_path, dir_path, label) {
+    await this._structure_dir_path(dir_path);
+    const snapshotUrl = `https://web.archive.org/web/${timestamp}id_/${url}`;
+    let res;
+    try {
+      res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
+    } catch (e) {
+      console.log(`Skipping ${label}${url}, fetch failed: ${e}`);
+      return null;
+    }
+    if (!res.ok || !res.body) {
+      console.log(`Skipping ${label}${url}, bad response ${res.status}`);
+      return null;
+    }
+
+    try {
+      // pipeline() rejects on read errors too; a bare pipe() would never settle.
+      await pipeline(Readable.fromWeb(res.body), fs.createWriteStream(file_path));
+    } catch (e) {
+      // Best-effort removal of the truncated file, so later runs do not skip it.
+      await rm(file_path, { force: true }).catch(() => {});
+      throw e;
+    }
+    return res;
+  }
+
+  /**
+   * Download one asset unless it already exists or is already being written.
+   *
+   * @returns {Promise<?string>} file_path on success, null when skipped or failed.
+   */
+  async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
+    // Check in-flight jobs first: a file that is still being written already exists.
+    const running = this._asset_jobs.get(file_path);
+    if (running) return running;
+    if (fs.existsSync(file_path)) return file_path;
+
+    const job = (async () => {
+      try {
+        const res = await this._save_snapshot(
+          assetUrl,
+          pageTimestamp,
+          file_path,
+          dir_path,
+          "asset "
+        );
+        return res ? file_path : null;
+      } catch (e) {
+        console.log(`Asset download failed: ${assetUrl} → ${e}`);
+        return null;
+      } finally {
+        this._asset_jobs.delete(file_path);
+      }
+    })();
+    this._asset_jobs.set(file_path, job);
+    return job;
+  }
+
+  /**
+   * Relative link from a saved page's directory to the local copy of `u`.
+   * The fragment is appended after the path is resolved so it never becomes
+   * part of a file name.
+   */
+  _local_link(u, baseDir, backupRoot) {
+    const localTarget = ensureLocalTargetForPath(u.pathname);
+    const localAbsPath = path.join(backupRoot, localTarget);
+    return relativeLink(baseDir, localAbsPath) + (u.hash || "");
+  }
+
+  /**
+   * Download the assets a saved HTML page references and, depending on the
+   * options, rewrite its links and drop its canonical link.
+   *
+   * @param {string} htmlPath - Local path of the saved page.
+   * @param {string} pageUrl - Original URL of the page, base for relative URLs.
+   * @param {string} pageTimestamp - Snapshot timestamp used for the assets.
+   */
+  async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
+    try {
+      const backupRoot = this.backup_path();
+      const html = fs.readFileSync(htmlPath, "utf8");
+      const $ = load(html);
+      const site = new URL(this.base_url);
+      const siteHost = site.hostname.replace(/^www\./, "");
+      const baseDir = path.dirname(htmlPath);
+      const isInternal = (u) => u.hostname.replace(/^www\./, "") === siteHost;
+
+      const downloadTasks = [];
+      let modified = false;
+
+      // ----------- ASSETS -----------
+      $(
+        "img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
+      ).each((_, el) => {
+        const attr = el.tagName === "link" ? "href" : "src";
+        const val = $(el).attr(attr);
+        if (!val) return;
+
+        try {
+          const u = new URL(val, pageUrl);
+          if (!isInternal(u) && !this.download_external_assets) return;
+
+          const abs = u.toString();
+          const paths = this._determine_paths(abs, u.pathname);
+          if (!paths) return;
+          const { dir_path, file_path } = paths;
+
+          if (this.rewrite_links) {
+            $(el).attr(attr, this._local_link(u, baseDir, backupRoot));
+            modified = true;
+          }
+
+          if (!fs.existsSync(file_path)) {
+            downloadTasks.push(
+              this._download_asset(abs, pageTimestamp, file_path, dir_path)
+            );
+          }
+        } catch {
+          // Best effort: on any failure here the element is left untouched.
+        }
+      });
+
+      // ----------- INTERNAL LINKS (pages/forms) -----------
+      if (this.rewrite_links) {
+        $("a[href], form[action]").each((_, el) => {
+          const attr = el.tagName === "a" ? "href" : "action";
+          const val = $(el).attr(attr);
+          if (!val) return;
+
+          try {
+            const u = new URL(val, pageUrl);
+            if (isInternal(u)) {
+              $(el).attr(attr, this._local_link(u, baseDir, backupRoot));
+              modified = true;
+            }
+          } catch {
+            // Best effort: on any failure here the element is left untouched.
+          }
+        });
+      }
+
+      await Promise.all(downloadTasks);
+
+      if (this.canonical_action === "remove") {
+        const canonical = $("link[rel=\"canonical\"]");
+        if (canonical.length > 0) {
+          canonical.remove();
+          modified = true;
+        }
+      }
+
+      // Only rewrite the file when something changed, so "as-is" pages keep
+      // the exact bytes that were downloaded.
+      if (modified) fs.writeFileSync(htmlPath, $.html(), "utf8");
+    } catch (e) {
+      console.log(`HTML processing error: ${e}`);
+    }
+  }
+
+  /**
+   * Download one entry of the file list, post-process it when it is HTML,
+   * and advance the progress bar. Failures are logged, never thrown.
+   *
+   * @param {{file_url: string, file_id: string, timestamp: string}} file_remote_info
+   * @param {number} total - Size of the whole list, for the progress bar.
+   */
+  async _download_single(file_remote_info, total) {
+    const file_url = String(file_remote_info.file_url);
+    const file_id = file_remote_info.file_id;
+    const file_timestamp = file_remote_info.timestamp;
+
+    try {
+      const paths = this._determine_paths(file_url, file_id);
+      if (!paths) {
+        console.log(`Skipping invalid URL: ${file_url}`);
+        return;
+      }
+      const { dir_path, file_path } = paths;
+      if (fs.existsSync(file_path)) return;
+
+      const res = await this._save_snapshot(
+        file_url,
+        file_timestamp,
+        file_path,
+        dir_path,
+        ""
+      );
+      if (!res) return;
+
+      const contentType = res.headers.get("content-type");
+      const looksHtml =
+        isHtmlFile(file_path, contentType, null) ||
+        path.extname(file_path) === "";
+      if (looksHtml) {
+        await this._process_html_assets(file_path, file_url, file_timestamp);
+      }
+    } catch (e) {
+      console.log(`Download failed for ${file_url}: ${e}`);
+    } finally {
+      // Runs on every exit path, so each entry is counted exactly once.
+      this._processed++;
+      renderProgress(this._processed, total);
+    }
+  }
+
+  /** Download every file of the curated list, threads_count downloads at a time. */
+  async download_files() {
+    const startTime = Date.now();
+    console.log(
+      `Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
+    );
+    const list = await this.get_file_list_by_timestamp();
+    if (list.length === 0) {
+      console.log("No files to download.");
+      return;
+    }
+
+    const concurrency =
+      this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
+    const limit = pLimit(concurrency);
+    this._processed = 0;
+    await Promise.all(
+      list.map((info) => limit(() => this._download_single(info, list.length)))
+    );
+    const endTime = Date.now();
+    console.log(
+      `\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
+        2
+      )}s, saved in ${this.backup_path()} (${list.length} files)`
+    );
+  }
+}
+
+// ============================= INTERACTIVE RUN =============================
+/** Ask one question on the readline interface and resolve with the trimmed answer. */
+function ask(rl, question) {
+  return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
+}
+
+/** Prompt for all options on the terminal, then run the download. */
+async function interactiveMain() {
+  const rl = readline.createInterface({
+    input: process.stdin,
+    output: process.stdout,
+  });
+
+  let options;
+  try {
+    let base_url;
+    while (true) {
+      base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
+      if (!base_url) continue;
+      try {
+        new URL(base_url);
+        break;
+      } catch {
+        console.log("Please enter a valid URL.\n");
+      }
+    }
+
+    const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
+    const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");
+
+    let rewrite_mode = "as-is";
+    const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
+    if (/^y(es)?$/i.test(m)) rewrite_mode = "relative";
+
+    let canonical_action = "keep";
+    if (rewrite_mode === "relative") {
+      const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
+      if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
+    }
+
+    let threads_count = await ask(rl, "How many download threads? (default 3): ");
+    threads_count = parseInt(threads_count || "3", 10);
+    if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
+
+    const exact_url = /^y(es)?$/i.test(
+      await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
+    );
+    const directory = await ask(
+      rl,
+      "Target directory (leave blank for default websites/<host>/): "
+    );
+
+    const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
+    const download_external_assets = /^y(es)?$/i.test(ext);
+
+    options = {
+      base_url,
+      exact_url,
+      directory: directory || null,
+      from_timestamp: from_timestamp || 0,
+      to_timestamp: to_timestamp || 0,
+      threads_count,
+      rewrite_mode,
+      canonical_action,
+      download_external_assets,
+    };
+  } finally {
+    // Always release stdin, even if a prompt throws.
+    rl.close();
+  }
+
+  await new WaybackMachineDownloader(options).download_files();
+}
+
+/** True when both paths resolve to the same file once symlinks are followed. */
+function isSameRealPath(a, b) {
+  try {
+    return fs.realpathSync(a) === fs.realpathSync(b);
+  } catch {
+    return false;
+  }
+}
+
+// process.argv[1] may be missing, or may be a symlink to this file (for example
+// when started through a package "bin" link), so the URL checks alone can miss.
+const entryPath = process.argv[1];
+const isDirectRun =
+  Boolean(entryPath) &&
+  (import.meta.url === `file://${entryPath}` ||
+    import.meta.url === pathToFileURL(entryPath).href ||
+    isSameRealPath(entryPath, __filename));
+
+if (isDirectRun) {
+  interactiveMain().catch((err) => {
+    console.error(`FATAL: ${err?.stack || err}`);
+    process.exit(1);
+  });
+}
+
+export { WaybackMachineDownloader };
diff --git a/wayback-machine-downloader/package.json b/wayback-machine-downloader/package.json
new file mode 100644
index 0000000..144a2b6
--- /dev/null
+++ b/wayback-machine-downloader/package.json
@@ -0,0 +1,35 @@
+{
+  "name": "wayback-downloader",
+  "version": "0.1.0",
+  "description": "Interactive Wayback Machine downloader for archiving websites locally.",
+  "type": "module",
+  "main": "downloader.js",
+  "bin": {
+    "wayback-downloader": "downloader.js"
+  },
+  "scripts": {
+    "start": "node downloader.js"
+  },
+  "dependencies": {
+    "cheerio": "^1.0.0-rc.12",
+    "p-limit": "^4.0.0"
+  },
+  "engines": {
+    "node": ">=18"
+  },
+  "keywords": [
+    "wayback-machine-downloader",
+    "web-archive-downloader",
+    "archiver"
+  ],
+  "author": "birbwatcher",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/birbwatcher/wayback-downloader.git"
+  },
+  "bugs": {
+    "url": "https://github.com/birbwatcher/wayback-downloader/issues"
+  },
+  "homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
+}