init: initial project setup

User
2025-09-27 16:41:13 +02:00
parent dbd2f0adda
commit 7541f2ebee
4 changed files with 656 additions and 1 deletions

README.md

@@ -1 +1,113 @@
# wayback-machine-downloader
# Wayback Machine Downloader JS
![Web Archive Website Downloader](assets/webarchive-downloader.jpg)
A script written in **Node.js** for downloading websites from [Web Archive](https://web.archive.org/).
Intended for use by:
- **Webmasters** — to restore their lost or hacked projects
- **OSINT researchers** — for local work with resources that no longer exist
This webarchive website downloader has an interactive interface and supports downloading either with original links preserved or with links rewritten into relative ones (for local browsing).
---
## Features of Web Archive Website Downloader
1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets.
2. Rewrite internal links for correct local browsing.
3. Multithreading support.
4. Save results into a chosen folder while keeping the original structure.
5. Ability to download external assets (e.g., images or scripts from a CDN).
#### Special Features
- The script normalizes parameterized file names such as `main.css?ver=1.2` to `main.css` so assets resolve correctly when browsed locally (see the sketch below).
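The script keys downloaded files off the URL's `pathname`, which the URL parser already separates from the query string. A minimal standalone sketch of the idea (not the script's exact code):
```js
// Sketch: "?ver=1.2" lands in `search`, so `pathname` is already clean.
const asset = new URL("https://example.com/assets/main.css?ver=1.2");
console.log(asset.pathname); // "/assets/main.css"
console.log(asset.search);   // "?ver=1.2"
```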
---
## Requirements
- Node.js version 18.x or higher
---
## Installation
```bash
git clone https://github.com/birbwatcher/wayback-machine-downloader.git
cd wayback-machine-downloader
# Install dependencies
npm install
```
---
## Run
```bash
node downloader.js
```
After launching, an interactive menu will appear with the following questions:
- base URL (e.g., https://example.com)
- date range (from/to)
- number of threads
- link rewriting mode (keep as-is or convert to relative)
- whether to remove `rel=canonical` from the downloaded site
- whether to download external assets
- directory for saving the files
---
## Example
```bash
node downloader.js
```
Dialog example:
```bash
Enter base URL to archive (e.g., https://example.com): https://example.com
From timestamp (YYYYMMDDhhmmss) or leave blank: 20200101000000
To timestamp (YYYYMMDDhhmmss) or leave blank: 20201231235959
Rewrite links? (yes=relative / no=as-is, default no): yes
Canonical: "keep" (default) or "remove": keep
How many download threads? (default 3): 5
Only exact URL (no wildcard /*)? (yes/no, default no): no
Target directory (leave blank for default websites/<host>/):
Download external assets? (yes/no, default no): no
```
After this, the archive download will begin.
---
## Common Issues
#### Script downloads only the homepage
**Answer:** try appending `/*` to the base URL, e.g. `https://example.com/*`, or try downloading a different time range.
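Under the hood the script asks the Wayback CDX API for the list of captures, and the `/*` wildcard is what makes the query match every path on the host. You can inspect what the archive holds for your site with a direct query in the same style the script uses (here additionally filtered to successful captures):
```bash
curl "https://web.archive.org/cdx/search/cdx?url=example.com/*&output=json&fl=timestamp,original&collapse=digest&filter=statuscode:200"
```
An empty result means the archive simply has no matching captures for that pattern or time range.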
---
## (Important) Download responsibly
Please note that downloading third-party websites may violate copyright laws.
Use this tool responsibly and make sure not to break the law.
---
## Contributing
Pull requests are welcome!
For major changes, please open an issue first to discuss what you would like to change.
1. Fork the project
2. Create your feature branch (`git checkout -b feature/fooBar`)
3. Commit your changes (`git commit -am 'Add some fooBar'`)
4. Push to the branch (`git push origin feature/fooBar`)
5. Create a new Pull Request

assets/webarchive-downloader.jpg — binary file not shown (23 KiB)

downloader.js

@@ -0,0 +1,508 @@
/*
* Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js
*/
import fs from "fs";
import path from "path";
import { fileURLToPath, pathToFileURL } from "url";
import { mkdir } from "fs/promises";
import pLimit from "p-limit";
import { load } from "cheerio";
import { Readable } from "stream";
import readline from "readline";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// ----------------------------- PROGRESS BAR -----------------------------
function renderProgress(current, total) {
const width = 40;
const ratio = total > 0 ? current / total : 0;
const filled = Math.round(ratio * width);
const bar = "█".repeat(filled) + "-".repeat(width - filled);
process.stdout.write(
`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
);
if (current === total) process.stdout.write("\n");
}
// ----------------------------- HELPERS -----------------------------
function toPosix(p) {
return p.split(path.sep).join("/");
}
function relativeLink(fromDir, toFile) {
const rel = path.relative(fromDir, toFile);
return toPosix(rel || path.basename(toFile));
}
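// Map a URL pathname to its local file target, e.g.:
//   "/blog/"  -> "/blog/index.html"   (trailing slash: directory index)
//   "/about"  -> "/about/index.html"  (no dot in last segment: treat as page)
//   "/app.css" stays "/app.css"       (has a file extension)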
function ensureLocalTargetForPath(pathname) {
return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".")
? path.posix.join(pathname, "index.html")
: pathname;
}
// ----------------------------- HTML CHECK -----------------------------
function isHtmlFile(filePath, contentType, firstBytes) {
if (contentType && /text\/html/i.test(String(contentType))) return true;
const ext = path.extname(filePath).toLowerCase();
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
const head = (firstBytes || "").toString("utf8", 0, 512);
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
// ----------------------------- Archive API -----------------------------
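// The CDX endpoint returns [timestamp, original] rows for every capture of a
// URL; "collapse=digest" drops byte-identical captures, and the
// "statuscode:200" filter (applied only when `all` is false) keeps just
// successful snapshots.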
async function getRawListFromApi({
baseUrl,
pageIndex,
all,
fromTimestamp,
toTimestamp,
}) {
const cdx = new URL("https://web.archive.org/cdx/search/cdx");
const params = new URLSearchParams();
params.set("output", "json");
params.set("url", baseUrl);
params.set("fl", "timestamp,original");
params.set("collapse", "digest");
params.set("gzip", "false");
if (!all) params.append("filter", "statuscode:200");
if (fromTimestamp && Number(fromTimestamp) !== 0)
params.set("from", String(fromTimestamp));
if (toTimestamp && Number(toTimestamp) !== 0)
params.set("to", String(toTimestamp));
if (pageIndex != null) params.set("page", String(pageIndex));
cdx.search = params.toString();
try {
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
const text = await res.text();
const json = JSON.parse(text);
if (
Array.isArray(json) &&
Array.isArray(json[0]) &&
json[0].join(",") === "timestamp,original"
) {
json.shift();
}
return json || [];
} catch (e) {
console.log(`ERROR getRawListFromApi: ${e}`);
return [];
}
}
// ----------------------------- DOWNLOADER CLASS -----------------------------
class WaybackMachineDownloader {
constructor(params) {
this.base_url = params.base_url;
this.exact_url = !!params.exact_url;
this.directory = params.directory || null;
this.from_timestamp = params.from_timestamp
? Number(params.from_timestamp)
: 0;
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
this.threads_count =
params.threads_count != null ? Number(params.threads_count) : 3;
this.download_external_assets = params.download_external_assets || false;
this.rewrite_mode = params.rewrite_mode || "as-is";
this.rewrite_links = this.rewrite_mode === "relative";
this.canonical_action = params.canonical_action || "keep";
this._processed = 0;
}
backup_name() {
try {
if (this.base_url.includes("//")) {
const u = new URL(this.base_url);
return u.host;
}
} catch {}
return this.base_url;
}
backup_path() {
if (this.directory) {
return this.directory.endsWith(path.sep)
? this.directory
: this.directory + path.sep;
}
return path.join("websites", this.backup_name(), path.sep);
}
async get_all_snapshots_to_consider() {
console.log("Getting snapshot pages");
const httpOpts = {
all: true,
fromTimestamp: this.from_timestamp,
toTimestamp: this.to_timestamp,
};
let list = [];
list = list.concat(
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
);
process.stdout.write(".");
if (!this.exact_url) {
const wildcard = this.base_url.endsWith("/*")
? this.base_url
: this.base_url.replace(/\/*$/, "") + "/*";
for (let i = 0; i < 100; i++) {
const batch = await getRawListFromApi({
baseUrl: wildcard,
pageIndex: i,
...httpOpts,
});
if (!batch || batch.length === 0) break;
list = list.concat(batch);
process.stdout.write(".");
}
}
console.log(` found ${list.length} snapshots to consider.\n`);
return list;
}
async get_file_list_by_timestamp() {
const curated = new Map();
const all = await this.get_all_snapshots_to_consider();
for (const pair of all) {
const ts = pair[0];
const url = pair[1];
try {
const u = new URL(url);
const file_id = u.pathname;
const prev = curated.get(file_id);
if (!prev || prev.timestamp <= ts) {
curated.set(file_id, { file_url: url, timestamp: ts, file_id });
}
} catch {}
}
const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
return arr;
}
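// Percent-encode characters that Windows forbids in file names
// (e.g. "?" -> "%3f") so writes on win32 cannot fail on the path.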
_windowsSanitize(p) {
if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) =>
"%" + s.charCodeAt(0).toString(16)
);
}
async _structure_dir_path(dir_path) {
try {
await mkdir(dir_path, { recursive: true });
} catch (e) {
if (!e || e.code !== "EEXIST") throw e;
}
}
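// Map a captured URL onto the local backup tree:
//   ""              -> <backup>/index.html
//   "/a/b/"         -> <backup>/a/b/index.html
//   "/a/b" (no dot) -> <backup>/a/b/index.html
//   "/a/b.css"      -> <backup>/a/b.css
// Returns null for data:/javascript: URLs and for overly long paths.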
_determine_paths(file_url, file_id) {
if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
return null;
if (file_id.length > 200) return null;
const backup = this.backup_path();
const parts = file_id.split("/").filter(Boolean);
let dir_path, file_path;
if (file_id === "") {
dir_path = backup;
file_path = path.join(backup, "index.html");
} else if (
file_url.endsWith("/") ||
!parts[parts.length - 1].includes(".")
) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
}
dir_path = this._windowsSanitize(dir_path);
file_path = this._windowsSanitize(file_path);
return { dir_path, file_path };
}
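// The "id_" suffix in /web/<timestamp>id_/<url> requests the raw original
// capture, without the Wayback toolbar or the archive's rewritten links.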
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
try {
if (fs.existsSync(file_path)) return file_path;
await this._structure_dir_path(dir_path);
const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
return null;
}
if (!res.ok || !res.body) {
console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
return null;
}
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(file_path);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
return file_path;
} catch (e) {
console.log(`Asset download failed for ${assetUrl}: ${e}`);
return null;
}
}
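// Post-process a downloaded HTML page: collect referenced assets for download,
// optionally rewrite internal links and form actions to relative local paths,
// and optionally strip <link rel="canonical"> before writing the file back.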
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
try {
const backupRoot = this.backup_path();
let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html);
const site = new URL(this.base_url);
const siteHost = site.hostname.replace(/^www\./, "");
const baseDir = path.dirname(htmlPath);
const downloadTasks = [];
// ----------- ASSETS -----------
$(
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
).each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
if (isInternal || this.download_external_assets) {
const file_id = u.pathname;
const paths = this._determine_paths(abs, file_id);
if (!paths) return;
const { dir_path, file_path } = paths;
if (this.rewrite_links) {
const localTarget = ensureLocalTargetForPath(u.pathname);
const localAbsPath = path.join(backupRoot, localTarget);
$(el).attr(attr, relativeLink(baseDir, localAbsPath) + (u.hash || ""));
}
if (!fs.existsSync(file_path)) {
downloadTasks.push(
this._download_asset(abs, pageTimestamp, file_path, dir_path)
);
}
}
} catch {}
});
// ----------- INTERNAL LINKS (pages/forms) -----------
if (this.rewrite_links) {
$("a[href], form[action]").each((_, el) => {
const attr = el.tagName === "a" ? "href" : "action";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
if (isInternal) {
const localTarget = ensureLocalTargetForPath(u.pathname);
const localAbsPath = path.join(backupRoot, localTarget);
// Preserve the fragment (#section) on the rewritten local link.
$(el).attr(attr, relativeLink(baseDir, localAbsPath) + (u.hash || ""));
}
} catch {}
});
}
await Promise.all(downloadTasks);
if (this.canonical_action === "remove") {
$("link[rel=\"canonical\"]").remove();
}
fs.writeFileSync(htmlPath, $.html(), "utf8");
} catch (e) {
console.log(`HTML processing error: ${e}`);
}
}
async _download_single(file_remote_info, total) {
const file_url = String(file_remote_info.file_url);
const file_id = file_remote_info.file_id;
const file_timestamp = file_remote_info.timestamp;
const paths = this._determine_paths(file_url, file_id);
if (!paths) {
console.log(`Skipping invalid URL: ${file_url}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
const { dir_path, file_path } = paths;
if (fs.existsSync(file_path)) {
this._processed++;
renderProgress(this._processed, total);
return;
}
try {
await this._structure_dir_path(dir_path);
const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
console.log(`Skipping ${file_url}, fetch failed: ${e}`);
return;
}
if (!res.ok || !res.body) {
console.log(`Skipping ${file_url}, bad response ${res.status}`);
return;
}
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(file_path);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
const contentType = res.headers.get("content-type");
const ext = path.extname(file_path).toLowerCase();
const looksHtml =
isHtmlFile(file_path, contentType, null) ||
ext === "" ||
ext === ".html" ||
ext === ".htm";
if (looksHtml) {
await this._process_html_assets(file_path, file_url, file_timestamp);
}
} catch (e) {
console.log(`Download failed for ${file_url}: ${e}`);
} finally {
this._processed++;
renderProgress(this._processed, total);
}
}
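// p-limit provides the "threads": every download is scheduled up front, but
// at most `threads_count` fetches are actually in flight at any moment.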
async download_files() {
const startTime = Date.now();
console.log(
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
);
const list = await this.get_file_list_by_timestamp();
if (list.length === 0) {
console.log("No files to download.");
return;
}
const concurrency =
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const limit = pLimit(concurrency);
this._processed = 0;
await Promise.all(
list.map((info) => limit(() => this._download_single(info, list.length)))
);
const endTime = Date.now();
console.log(
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
2
)}s, saved in ${this.backup_path()} (${list.length} files)`
);
}
}
// ============================= INTERACTIVE RUN =============================
function ask(rl, question) {
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
}
async function interactiveMain() {
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
});
let base_url;
while (true) {
base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
if (!base_url) continue;
try {
new URL(base_url);
break;
} catch {
console.log("Please enter a valid URL.\n");
}
}
const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");
let rewrite_mode = "as-is";
const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
if (/^y(es)?$/i.test(m)) rewrite_mode = "relative";
let canonical_action = "keep";
if (rewrite_mode === "relative") {
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
}
let threads_count = await ask(rl, "How many download threads? (default 3): ");
threads_count = parseInt(threads_count || "3", 10);
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
const exact_url = /^y(es)?$/i.test(
await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
);
const directory = await ask(
rl,
"Target directory (leave blank for default websites/<host>/): "
);
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
const download_external_assets = /^y(es)?$/i.test(ext);
rl.close();
const dl = new WaybackMachineDownloader({
base_url,
exact_url,
directory: directory || null,
from_timestamp: from_timestamp || 0,
to_timestamp: to_timestamp || 0,
threads_count,
rewrite_mode,
canonical_action,
download_external_assets,
});
await dl.download_files();
}
const isDirectRun =
process.argv[1] != null &&
(import.meta.url === `file://${process.argv[1]}` ||
import.meta.url === pathToFileURL(process.argv[1]).href);
if (isDirectRun) {
interactiveMain().catch((err) => {
console.error(`FATAL: ${err?.stack || err}`);
process.exit(1);
});
}
export { WaybackMachineDownloader };
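// Example of programmatic use from another ESM module (a minimal sketch;
// option names mirror the interactive prompts above):
//
//   import { WaybackMachineDownloader } from "./downloader.js";
//   const dl = new WaybackMachineDownloader({
//     base_url: "https://example.com",
//     threads_count: 3,
//     rewrite_mode: "relative",  // or "as-is"
//     canonical_action: "keep",  // or "remove"
//   });
//   await dl.download_files();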

package.json

@@ -0,0 +1,35 @@
{
"name": "wayback-downloader",
"version": "0.1.0",
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
"type": "module",
"main": "downloader.js",
"bin": {
"wayback-downloader": "downloader.js"
},
"scripts": {
"start": "node downloader.js"
},
"dependencies": {
"cheerio": "^1.0.0-rc.12",
"p-limit": "^4.0.0"
},
"engines": {
"node": ">=18"
},
"keywords": [
"wayback-machine-downloader",
"web-archive-downloder",
"archiver"
],
"author": "birbwatcher",
"license": "MIT",
"repository": {
"type": "git",
"url": "https://github.com/birbwatcher/wayback-downloader.git"
},
"bugs": {
"url": "https://github.com/birbwatcher/wayback-downloader/issues"
},
"homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
}