Files
wayback-machine-downloader/wayback-machine-downloader/downloader.js

513 lines
16 KiB
JavaScript

/*
* Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js
*/
import fs from "fs";
import path from "path";
import { fileURLToPath, pathToFileURL, domainToUnicode } from "url";
import { mkdir } from "fs/promises";
import pLimit from "p-limit";
import { load } from "cheerio";
import { Readable } from "stream";
import readline from "readline";
// Derive this module's absolute file path and its containing directory from
// import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// ----------------------------- PROGRESS BAR -----------------------------
/**
 * Draws a one-line progress bar on stdout, overwriting the previous one via
 * a leading carriage return. A newline is emitted once `current === total`.
 *
 * @param {number} current - Number of items processed so far.
 * @param {number} total - Number of items expected overall.
 */
function renderProgress(current, total) {
  const width = 40;
  const rawRatio = total > 0 ? current / total : 0;
  // Clamp to [0, 1]: an overshooting or non-finite counter would otherwise
  // hand a negative count to String#repeat, which throws a RangeError.
  const ratio = Number.isFinite(rawRatio)
    ? Math.min(1, Math.max(0, rawRatio))
    : 0;
  const filled = Math.round(ratio * width);
  const bar = "█".repeat(filled) + "-".repeat(width - filled);
  process.stdout.write(
    `\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
  );
  if (current === total) process.stdout.write("\n");
}
// ----------------------------- HELPERS -----------------------------
/**
 * Replaces every platform path separator in `p` with a forward slash.
 *
 * @param {string} p - Path using the current platform's separator.
 * @returns {string} The same path written with "/" separators.
 */
function toPosix(p) {
  return p.replaceAll(path.sep, "/");
}
/**
 * Builds the forward-slash relative link that leads from a directory to a file.
 *
 * @param {string} fromDir - Directory the link is resolved from.
 * @param {string} toFile - File the link should point at.
 * @returns {string} Relative path; the file's base name when both paths
 *   resolve to the same location.
 */
function relativeLink(fromDir, toFile) {
  let rel = path.relative(fromDir, toFile);
  if (!rel) {
    // An empty relative path is not a usable link; use the base name instead.
    rel = path.basename(toFile);
  }
  return toPosix(rel);
}
/**
 * Maps a URL pathname to the local file that represents it.
 *
 * Pathnames that end in "/" or whose last segment contains no "." are treated
 * as directories and get "index.html" appended; anything else is returned
 * unchanged.
 *
 * @param {string} pathname - URL pathname (POSIX separators).
 * @returns {string} Pathname of the local target file.
 */
function ensureLocalTargetForPath(pathname) {
  if (pathname.endsWith("/")) {
    return path.posix.join(pathname, "index.html");
  }
  const leaf = path.posix.basename(pathname);
  if (leaf.includes(".")) {
    return pathname;
  }
  return path.posix.join(pathname, "index.html");
}
// ----------------------------- HTML CHECK -----------------------------
/**
 * Decides whether a downloaded file should be treated as HTML.
 *
 * Checks, in order: a `text/html` content type, a known HTML-producing file
 * extension, and finally an HTML doctype / `<html` tag within the first 512
 * bytes of `firstBytes`.
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @param {string|null|undefined} contentType - Content-Type header value, if any.
 * @param {Buffer|string|null|undefined} firstBytes - Leading bytes of the file, if any.
 * @returns {boolean} True when any of the checks matches.
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  const declaredHtml =
    Boolean(contentType) && /text\/html/i.test(String(contentType));
  if (declaredHtml) return true;

  const htmlExtensions = [".html", ".htm", ".php", ".asp", ".aspx"];
  const extension = path.extname(filePath).toLowerCase();
  if (htmlExtensions.includes(extension)) return true;

  const head = (firstBytes || "").toString("utf8", 0, 512);
  const markers = [/<!doctype html/i, /<html[\s>]/i];
  return markers.some((marker) => marker.test(head));
}
// ----------------------------- Archive API -----------------------------
/**
 * Queries the Wayback Machine CDX endpoint and returns the matching captures.
 *
 * Never throws: any failure (network error, non-2xx status, unparsable or
 * non-array payload) is reported as an empty list so callers can always
 * iterate the result.
 *
 * @param {object} options
 * @param {string} options.baseUrl - Value sent as the `url` query parameter.
 * @param {number|null} [options.pageIndex] - Sent as `page` unless null/undefined.
 * @param {boolean} [options.all] - When falsy, adds the `statuscode:200` filter.
 * @param {number|string} [options.fromTimestamp] - Sent as `from` when non-zero.
 * @param {number|string} [options.toTimestamp] - Sent as `to` when non-zero.
 * @returns {Promise<Array>} Rows as returned by the API (requested fields are
 *   `timestamp,original`), with the header row removed when present.
 */
async function getRawListFromApi({
  baseUrl,
  pageIndex,
  all,
  fromTimestamp,
  toTimestamp,
}) {
  const cdx = new URL("https://web.archive.org/cdx/search/xd");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  params.set("collapse", "digest");
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0) {
    params.set("from", String(fromTimestamp));
  }
  if (toTimestamp && Number(toTimestamp) !== 0) {
    params.set("to", String(toTimestamp));
  }
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    if (!res.ok) {
      console.log(`ERROR getRawListFromApi: HTTP ${res.status}`);
      return [];
    }

    // An empty body means "no rows"; it is not worth an error message.
    const text = (await res.text()).trim();
    if (text === "") return [];

    const json = JSON.parse(text);
    // Callers iterate the result and read .length, so only arrays may escape.
    if (!Array.isArray(json)) return [];

    const [first] = json;
    const hasHeaderRow =
      Array.isArray(first) && first.join(",") === "timestamp,original";
    return hasHeaderRow ? json.slice(1) : json;
  } catch (e) {
    console.log(`ERROR getRawListFromApi: ${e}`);
    return [];
  }
}
// ----------------------------- DOWNLOADER CLASS -----------------------------
class WaybackMachineDownloader {
constructor(params) {
this.base_url = params.base_url;
this.exact_url = !!params.exact_url;
this.directory = params.directory || null;
this.from_timestamp = params.from_timestamp
? Number(params.from_timestamp)
: 0;
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
this.threads_count =
params.threads_count != null ? Number(params.threads_count) : 3;
this.download_external_assets = params.download_external_assets || false;
this.rewrite_mode = params.rewrite_mode || "as-is";
this.rewrite_links = this.rewrite_mode === "relative";
this.canonical_action = params.canonical_action || "keep";
this._processed = 0;
}
backup_name() {
try {
if (this.base_url.includes("//")) {
const u = new URL(this.base_url);
return domainToUnicode(u.host); // use human-readable domain
}
} catch {}
return this.base_url;
}
backup_path() {
if (this.directory) {
return this.directory.endsWith(path.sep)
? this.directory
: this.directory + path.sep;
}
return path.join("websites", this.backup_name(), path.sep);
}
async get_all_snapshots_to_consider() {
console.log("Getting snapshot pages");
const httpOpts = {
all: true,
fromTimestamp: this.from_timestamp,
toTimestamp: this.to_timestamp,
};
let list = [];
list = list.concat(
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
);
process.stdout.write(".");
if (!this.exact_url) {
const wildcard = this.base_url.endsWith("/*")
? this.base_url
: this.base_url.replace(/\/*$/, "") + "/*";
for (let i = 0; i < 100; i++) {
const batch = await getRawListFromApi({
baseUrl: wildcard,
pageIndex: i,
...httpOpts,
});
if (!batch || batch.length === 0) break;
list = list.concat(batch);
process.stdout.write(".");
}
}
console.log(` found ${list.length} snapshots to consider.\n`);
return list;
}
async get_file_list_by_timestamp() {
const curated = new Map();
const all = await this.get_all_snapshots_to_consider();
for (const pair of all) {
const ts = pair[0];
const url = pair[1];
try {
const u = new URL(url);
const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
const prev = curated.get(file_id);
if (!prev || prev.timestamp <= ts) {
curated.set(file_id, { file_url: url, timestamp: ts, file_id });
}
} catch {}
}
const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
return arr;
}
_windowsSanitize(p) {
if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) =>
"%" + s.charCodeAt(0).toString(16)
);
}
async _structure_dir_path(dir_path) {
try {
await mkdir(dir_path, { recursive: true });
} catch (e) {
if (!e || e.code !== "EEXIST") throw e;
}
}
_determine_paths(file_url, file_id) {
if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
return null;
if (file_id.length > 200) return null;
const backup = this.backup_path();
const parts = file_id.split("/").filter(Boolean);
let dir_path, file_path;
if (file_id === "") {
dir_path = backup;
file_path = path.join(backup, "index.html");
} else if (
file_url.endsWith("/") ||
!parts[parts.length - 1].includes(".")
) {
dir_path = path.join(backup, ...parts);
file_path = path.join(dir_path, "index.html");
} else {
dir_path = path.join(backup, ...parts.slice(0, -1));
file_path = path.join(backup, ...parts);
}
dir_path = this._windowsSanitize(dir_path);
file_path = this._windowsSanitize(file_path);
return { dir_path, file_path };
}
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
try {
if (fs.existsSync(file_path)) return file_path;
await this._structure_dir_path(dir_path);
const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
return null;
}
if (!res.ok || !res.body) {
console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
return null;
}
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(file_path);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
return file_path;
} catch (e) {
console.log(`Asset download failed: ${assetUrl}${e}`);
return null;
}
}
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
try {
const backupRoot = this.backup_path();
let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html);
const site = new URL(this.base_url);
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
const baseDir = path.dirname(htmlPath);
const downloadTasks = [];
// ----------- ASSETS -----------
$(
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
).each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal =
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal || this.download_external_assets) {
const file_id = decodeURIComponent(u.pathname);
const paths = this._determine_paths(abs, file_id);
if (!paths) return;
const { dir_path, file_path } = paths;
if (this.rewrite_links) {
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
const localTarget = ensureLocalTargetForPath(normPath);
const localAbsPath = path.join(backupRoot, localTarget);
$(el).attr(attr, relativeLink(baseDir, localAbsPath));
}
if (!fs.existsSync(file_path)) {
downloadTasks.push(
this._download_asset(abs, pageTimestamp, file_path, dir_path)
);
}
}
} catch {}
});
// ----------- INTERNAL LINKS (pages/forms) -----------
if (this.rewrite_links) {
$("a[href], form[action]").each((_, el) => {
const attr = el.tagName === "a" ? "href" : "action";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const isInternal =
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
if (isInternal) {
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
const localTarget = ensureLocalTargetForPath(normPath);
const localAbsPath = path.join(backupRoot, localTarget);
$(el).attr(attr, relativeLink(baseDir, localAbsPath));
}
} catch {}
});
}
await Promise.all(downloadTasks);
if (this.canonical_action === "remove") {
$("link[rel=\"canonical\"]").remove();
}
fs.writeFileSync(htmlPath, $.html(), "utf8");
} catch (e) {
console.log(`HTML processing error: ${e}`);
}
}
async _download_single(file_remote_info, total) {
const file_url = String(file_remote_info.file_url);
const file_id = file_remote_info.file_id;
const file_timestamp = file_remote_info.timestamp;
const paths = this._determine_paths(file_url, file_id);
if (!paths) {
console.log(`Skipping invalid URL: ${file_url}`);
this._processed++;
renderProgress(this._processed, total);
return;
}
const { dir_path, file_path } = paths;
if (fs.existsSync(file_path)) {
this._processed++;
renderProgress(this._processed, total);
return;
}
try {
await this._structure_dir_path(dir_path);
const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
console.log(`Skipping ${file_url}, fetch failed: ${e}`);
return;
}
if (!res.ok || !res.body) {
console.log(`Skipping ${file_url}, bad response ${res.status}`);
return;
}
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(file_path);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
const contentType = res.headers.get("content-type");
const ext = path.extname(file_path).toLowerCase();
const looksHtml =
isHtmlFile(file_path, contentType, null) ||
ext === "" ||
ext === ".html" ||
ext === ".htm";
if (looksHtml) {
await this._process_html_assets(file_path, file_url, file_timestamp);
}
} catch (e) {
console.log(`Download failed for ${file_url}: ${e}`);
} finally {
this._processed++;
renderProgress(this._processed, total);
}
}
async download_files() {
const startTime = Date.now();
console.log(
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
);
const list = await this.get_file_list_by_timestamp();
if (list.length === 0) {
console.log("No files to download.");
return;
}
const concurrency =
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
const limit = pLimit(concurrency);
this._processed = 0;
await Promise.all(
list.map((info) => limit(() => this._download_single(info, list.length)))
);
const endTime = Date.now();
console.log(
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
2
)}s, saved in ${this.backup_path()} (${list.length} files)`
);
}
}
// ============================= INTERACTIVE RUN =============================
/**
 * Promise wrapper around `rl.question`.
 *
 * @param {{question: Function}} rl - Readline interface used for prompting.
 * @param {string} question - Prompt text shown to the user.
 * @returns {Promise<string>} The answer with surrounding whitespace trimmed.
 */
function ask(rl, question) {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer.trim());
    });
  });
}
/**
 * Asks for all download settings on the terminal, then runs the download.
 *
 * Prompt order: base URL (repeated until it parses as a URL), from/to
 * timestamps, link rewriting, canonical handling (only when rewriting),
 * thread count, exact-URL mode, target directory, external assets.
 *
 * @returns {Promise<void>}
 */
async function interactiveMain() {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  const saidYes = (answer) => /^y(es)?$/i.test(answer);

  let base_url;
  let accepted = false;
  while (!accepted) {
    base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
    if (!base_url) continue;
    try {
      new URL(base_url);
      accepted = true;
    } catch {
      console.log("Please enter a valid URL.\n");
    }
  }

  const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
  const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");

  const rewriteAnswer = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
  const rewrite_mode = saidYes(rewriteAnswer) ? "relative" : "as-is";

  // Canonical handling is only asked about when links get rewritten.
  let canonical_action = "keep";
  if (rewrite_mode === "relative") {
    const canonicalAnswer = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
    if ((canonicalAnswer || "").toLowerCase() === "remove") {
      canonical_action = "remove";
    }
  }

  const threadsAnswer = await ask(rl, "How many download threads? (default 3): ");
  const parsedThreads = parseInt(threadsAnswer || "3", 10);
  const threads_count =
    Number.isFinite(parsedThreads) && parsedThreads > 0 ? parsedThreads : 3;

  const exactAnswer = await ask(
    rl,
    "Only exact URL (no wildcard /*)? (yes/no, default no): "
  );
  const exact_url = saidYes(exactAnswer);

  const directory = await ask(
    rl,
    "Target directory (leave blank for default websites/<host>/): "
  );

  const externalAnswer = await ask(rl, "Download external assets? (yes/no, default no): ");
  const download_external_assets = saidYes(externalAnswer);

  rl.close();

  const downloader = new WaybackMachineDownloader({
    base_url,
    exact_url,
    directory: directory || null,
    from_timestamp: from_timestamp || 0,
    to_timestamp: to_timestamp || 0,
    threads_count,
    rewrite_mode,
    canonical_action,
    download_external_assets,
  });
  await downloader.download_files();
}
// Start the interactive prompt only when this file is the process entry
// point, not when it is imported as a module.
// process.argv[1] is checked first because pathToFileURL throws on a
// non-string argument, which would otherwise make a plain import of this
// module fail when no entry script is present.
const isDirectRun =
  typeof process.argv[1] === "string" &&
  process.argv[1] !== "" &&
  (import.meta.url === `file://${process.argv[1]}` ||
    import.meta.url === pathToFileURL(process.argv[1]).href);
if (isDirectRun) {
  interactiveMain().catch((err) => {
    console.error(`FATAL: ${err?.stack || err}`);
    process.exit(1);
  });
}
export { WaybackMachineDownloader };