mirror of
https://github.com/birbwatcher/wayback-machine-downloader.git
synced 2026-01-29 01:40:41 +00:00
Merge pull request #8 from birbwatcher/work
feat: split into files and made some fixes
This commit is contained in:
@@ -84,7 +84,7 @@ Got ideas or suggestions? Feel free to open an issue!
|
||||
## Run
|
||||
|
||||
```bash
|
||||
node downloader.js
|
||||
node index.js
|
||||
```
|
||||
|
||||
After launching, an interactive menu will appear with the following questions:
|
||||
|
||||
88
wayback-machine-downloader/cli.js
Normal file
88
wayback-machine-downloader/cli.js
Normal file
@@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import path from "path";
import readline from "readline";
import { pathToFileURL } from "url";

import { WaybackMachineDownloader } from "./lib/downloader.js";
import { normalizeBaseUrlInput } from "./lib/utils.js";
|
||||
|
||||
/** Promise wrapper around readline's callback API; resolves the trimmed answer. */
function ask(rl, question) {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer.trim());
    });
  });
}
|
||||
|
||||
/**
 * Interactive CLI entry point: prompts on stdin for every download option,
 * then constructs a WaybackMachineDownloader and runs the full download.
 */
async function interactiveMain() {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  const saidYes = (answer) => /^y(es)?$/i.test(answer);

  // Keep asking until the input normalizes to a valid domain/URL.
  let normalizedBase;
  for (;;) {
    const baseInput = await ask(rl, "Enter domain or URL to archive (e.g., example.com): ");
    if (!baseInput) continue;
    try {
      normalizedBase = normalizeBaseUrlInput(baseInput);
      break;
    } catch {
      console.log("Please enter a valid domain or URL.\n");
    }
  }

  const base_url = normalizedBase.canonicalUrl;

  const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
  const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");

  const rewriteAnswer = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
  const rewrite_mode = saidYes(rewriteAnswer) ? "relative" : "as-is";

  // Canonical handling is only relevant when links are rewritten.
  let canonical_action = "keep";
  if (rewrite_mode === "relative") {
    const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
    if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
  }

  const threadsAnswer = await ask(rl, "How many download threads? (default 3): ");
  let threads_count = parseInt(threadsAnswer || "3", 10);
  if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;

  const exact_url = saidYes(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
  const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");

  const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
  const download_external_assets = saidYes(ext);

  rl.close();

  const dl = new WaybackMachineDownloader({
    base_url,
    normalized_base: normalizedBase,
    exact_url,
    directory: directory || null,
    from_timestamp: from_timestamp || 0,
    to_timestamp: to_timestamp || 0,
    threads_count,
    rewrite_mode,
    canonical_action,
    download_external_assets,
  });

  await dl.download_files();
}
|
||||
|
||||
// True when this module is the script Node was launched with.
// FIX: comparing import.meta.url against a hand-built `file://${path}`
// string fails on Windows (backslashes, drive letters) and whenever the
// path needs percent-encoding; pathToFileURL produces the canonical form.
const isDirectCliRun = (() => {
  const entryArg = process.argv && process.argv.length > 1 ? process.argv[1] : null;
  if (!entryArg) return false;
  try {
    return import.meta.url === pathToFileURL(path.resolve(entryArg)).href;
  } catch {
    return false;
  }
})();
|
||||
|
||||
if (isDirectCliRun) {
|
||||
interactiveMain().catch((err) => {
|
||||
console.error(`FATAL: ${err?.stack || err}`);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
export { interactiveMain };
|
||||
@@ -7,7 +7,4 @@ RUN npm install --production
|
||||
|
||||
COPY . .
|
||||
|
||||
CMD ["node", "downloader.js"]
|
||||
|
||||
ENTRYPOINT ["node", "downloader.js"]
|
||||
|
||||
ENTRYPOINT ["node", "index.js"]
|
||||
|
||||
@@ -1,493 +0,0 @@
|
||||
/*
|
||||
* Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
|
||||
* Run: node downloader.js
|
||||
*/
|
||||
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { fileURLToPath, pathToFileURL, domainToUnicode } from "url";
|
||||
import { mkdir } from "fs/promises";
|
||||
import pLimit from "p-limit";
|
||||
import { load } from "cheerio";
|
||||
import { Readable } from "stream";
|
||||
import readline from "readline";
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
|
||||
// ----------------------------- PROGRESS BAR -----------------------------
|
||||
/** Draw an in-place 40-char progress bar on stdout; newline once complete. */
function renderProgress(current, total) {
  const width = 40;
  const ratio = total > 0 ? current / total : 0;
  const filledCount = Math.round(ratio * width);
  const bar = `${"█".repeat(filledCount)}${"-".repeat(width - filledCount)}`;
  const percent = Math.round(ratio * 100);
  process.stdout.write(`\r[${bar}] ${percent}% (${current}/${total})`);
  if (current === total) {
    process.stdout.write("\n");
  }
}
|
||||
|
||||
// ----------------------------- HELPERS -----------------------------
|
||||
/** Convert an OS-specific path to forward-slash (POSIX/URL) form. */
function toPosix(p) {
  return p.split(path.sep).join("/");
}

/** URL-style relative link from a directory to a target file on disk. */
function relativeLink(fromDir, toFile) {
  const rel = path.relative(fromDir, toFile);
  return toPosix(rel === "" ? path.basename(toFile) : rel);
}

/** Map a URL pathname to its local file: directory-like paths get index.html. */
function ensureLocalTargetForPath(pathname) {
  const base = path.posix.basename(pathname);
  if (pathname.endsWith("/") || !base.includes(".")) {
    return path.posix.join(pathname, "index.html");
  }
  return pathname;
}
|
||||
|
||||
// ----------------------------- HTML CHECK -----------------------------
|
||||
// ----------------------------- HTML CHECK -----------------------------
/**
 * Heuristically decide whether a saved file is HTML, using (in order):
 * the Content-Type header, the file extension, and a sniff of the first
 * 512 bytes.
 * FIX: the sniff regex was /<html[\\s>]/ — the doubled backslash made the
 * character class match a literal backslash instead of whitespace, so
 * "<html lang=...>" was never detected. Corrected to [\s>].
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  if (contentType && /text\/html/i.test(String(contentType))) return true;
  const ext = path.extname(filePath).toLowerCase();
  if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
  const head = (firstBytes || "").toString("utf8", 0, 512);
  return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
|
||||
|
||||
|
||||
// ----------------------------- Archive API -----------------------------
|
||||
/**
 * Fetch one page of snapshot records from the Wayback Machine CDX API.
 * Returns an array of [timestamp, original_url] pairs. Any network or
 * parse failure yields [] so callers can keep paging without crashing.
 * FIX: the endpoint path was "/cdx/search/xd", which is not a valid CDX
 * endpoint; the correct path is "/cdx/search/cdx".
 */
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
  const cdx = new URL("https://web.archive.org/cdx/search/cdx");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  params.set("collapse", "digest"); // one row per unique content digest
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
  if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    const text = await res.text();
    let json = [];
    try {
      json = JSON.parse(text);
    } catch {
      // Non-JSON body (e.g. an HTML error page): treat as an empty page.
      return [];
    }
    // The first row is the column header ["timestamp","original"]; strip it.
    if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
      json.shift();
    }
    return json || [];
  } catch {
    // Network failure: skip this page silently, caller continues paging.
    return [];
  }
}
|
||||
|
||||
|
||||
// ----------------------------- DOWNLOADER CLASS -----------------------------
|
||||
/**
 * Downloads an archived site from the Wayback Machine: collects CDX
 * snapshot records, keeps the newest capture per pathname, saves each file
 * to disk, and optionally rewrites internal links to relative local paths.
 *
 * FIX applied throughout: the www-stripping regex was /^www\\./ — the
 * doubled backslash matched a literal backslash after "www", so "www."
 * prefixes were never stripped and internal-host detection failed for
 * www-prefixed sites. Corrected to /^www\./ (matching the refactored
 * lib/asset-manager.js in this same change set).
 */
class WaybackMachineDownloader {
  /**
   * @param {object} params
   * @param {string} params.base_url - Site root (bare domain or full URL).
   * @param {boolean} [params.exact_url] - Skip the wildcard "/*" crawl.
   * @param {string|null} [params.directory] - Output dir (default websites/<host>/).
   * @param {number|string} [params.from_timestamp] - Lower CDX bound (0 = none).
   * @param {number|string} [params.to_timestamp] - Upper CDX bound (0 = none).
   * @param {number} [params.threads_count=3] - Parallel download slots.
   * @param {string} [params.rewrite_mode="as-is"] - "relative" rewrites links.
   * @param {string} [params.canonical_action="keep"] - "remove" drops canonical tags.
   * @param {boolean} [params.download_external_assets=false]
   */
  constructor(params) {
    this.base_url = params.base_url;
    this.exact_url = !!params.exact_url;
    this.directory = params.directory || null;
    this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
    this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;

    this.download_external_assets = params.download_external_assets || false;

    this.rewrite_mode = params.rewrite_mode || "as-is"; // "as-is" | "relative"
    this.rewrite_links = this.rewrite_mode === "relative";
    this.canonical_action = params.canonical_action || "keep"; // "keep" | "remove"

    this._processed = 0; // shared progress counter fed to renderProgress()
  }

  // Create a human-readable backup folder name, preserving IDNs.
  backup_name() {
    try {
      if (this.base_url.includes("//")) {
        const u = new URL(this.base_url);
        return domainToUnicode(u.host);
      }
    } catch {}
    return this.base_url;
  }

  // Resolve the output directory (always returned with a trailing separator).
  backup_path() {
    if (this.directory) {
      return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
    }
    return path.join("websites", this.backup_name(), path.sep);
  }

  // Fetch and merge snapshot lists: one exact-URL query plus (unless
  // exact_url is set) up to 100 paginated wildcard "/*" queries.
  async get_all_snapshots_to_consider() {
    console.log("Getting snapshot pages");
    const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
    let list = [];

    list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
    process.stdout.write(".");

    if (!this.exact_url) {
      const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
      for (let i = 0; i < 100; i++) {
        const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
        if (!batch || batch.length === 0) break; // no more pages
        list = list.concat(batch);
        process.stdout.write(".");
      }
    }
    console.log(` found ${list.length} snapshots to consider.\n`);
    return list;
  }

  // Choose the latest timestamp per unique pathname.
  async get_file_list_by_timestamp() {
    const curated = new Map();
    const all = await this.get_all_snapshots_to_consider();
    for (const pair of all) {
      const ts = pair && pair[0];
      const url = pair && pair[1];
      if (!ts || !url) continue;
      try {
        const u = new URL(url);
        const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
        const prev = curated.get(file_id);
        if (!prev || prev.timestamp <= ts) {
          curated.set(file_id, { file_url: url, timestamp: ts, file_id });
        }
      } catch {}
    }
    const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
    // Newest first (timestamps are fixed-width strings, so lexicographic works).
    arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
    return arr;
  }

  // Replace Windows-hostile characters when running on Windows.
  _windowsSanitize(p) {
    if (process.platform !== "win32") return p;
    return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
  }

  // Ensure a directory exists (idempotent).
  async _structure_dir_path(dir_path) {
    try {
      await mkdir(dir_path, { recursive: true });
    } catch (e) {
      if (!e || e.code !== "EEXIST") throw e;
    }
  }

  // Compute local dir/file paths for an archived URL; returns null when the
  // URL should not be saved (data:/javascript: schemes, over-long paths).
  _determine_paths(file_url, file_id) {
    if (!file_url || !file_id) return null;
    if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
    if (file_id.length > 200) return null;

    const backup = this.backup_path();
    const parts = file_id.split("/").filter(Boolean);
    let dir_path, file_path;

    if (file_id === "") {
      dir_path = backup;
      file_path = path.join(backup, "index.html");
    } else {
      const lastPart = parts[parts.length - 1] || "";
      if (file_url.endsWith("/") || !lastPart.includes(".")) {
        // Extension-less target: treat it as a directory page.
        dir_path = path.join(backup, ...parts);
        file_path = path.join(dir_path, "index.html");
      } else {
        dir_path = path.join(backup, ...parts.slice(0, -1));
        file_path = path.join(backup, ...parts);
      }
    }

    dir_path = this._windowsSanitize(dir_path);
    file_path = this._windowsSanitize(file_path);

    return { dir_path, file_path };
  }

  // Download a single asset (img/css/js/etc.) referenced from an HTML page.
  async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
    try {
      if (fs.existsSync(file_path)) return file_path; // already saved

      await this._structure_dir_path(dir_path);
      const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
        return null;
      }
      if (!res.ok || !res.body) {
        console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
        return null;
      }

      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(file_path);
        Readable.fromWeb(res.body).pipe(ws);
        ws.on("finish", resolve);
        ws.on("error", reject);
      });

      return file_path;
    } catch (e) {
      console.log(`Asset download failed: ${assetUrl} → ${e}`);
      return null;
    }
  }

  // Parse saved HTML, optionally rewrite internal links to relative form,
  // and queue downloads for referenced assets.
  async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
    try {
      const backupRoot = this.backup_path();
      let html = fs.readFileSync(htmlPath, "utf8");
      const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
      const site = new URL(this.base_url);
      const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
      const baseDir = path.dirname(htmlPath);

      const downloadTasks = [];

      // ----------- ASSETS -----------
      $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
        const attr = el.tagName === "link" ? "href" : "src";
        const val = $(el).attr(attr);
        if (!val) return;

        try {
          const abs = new URL(val, pageUrl).toString();
          const u = new URL(abs);
          const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;

          if (isInternal || this.download_external_assets) {
            const file_id = decodeURIComponent(u.pathname);
            let paths;
            try {
              paths = this._determine_paths(abs, file_id);
            } catch (e) {
              console.log(`Invalid path for asset ${abs}: ${e}`);
              return;
            }
            if (!paths) return;
            const { dir_path, file_path } = paths;

            if (this.rewrite_links) {
              const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
              const localTarget = ensureLocalTargetForPath(normPath);
              const localAbsPath = path.join(backupRoot, localTarget);
              $(el).attr(attr, relativeLink(baseDir, localAbsPath));
            }

            if (!fs.existsSync(file_path)) {
              downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
            }
          }
        } catch {}
      });

      // ----------- INTERNAL LINKS (pages/forms) -----------
      if (this.rewrite_links) {
        $("a[href], form[action]").each((_, el) => {
          const attr = el.tagName === "a" ? "href" : "action";
          const val = $(el).attr(attr);
          if (!val) return;

          try {
            const abs = new URL(val, pageUrl).toString();
            const u = new URL(abs);
            const isInternal = domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;

            if (isInternal) {
              const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
              const localTarget = ensureLocalTargetForPath(normPath);
              const localAbsPath = path.join(backupRoot, localTarget);
              $(el).attr(attr, relativeLink(baseDir, localAbsPath));
            }
          } catch {}
        });
      }

      await Promise.all(downloadTasks);

      if (this.canonical_action === "remove") {
        $("link[rel=\"canonical\"]").remove();
      }

      fs.writeFileSync(htmlPath, $.html(), "utf8");
    } catch (e) {
      console.log(`HTML processing error: ${e}`);
    }
  }

  // Download one file from the snapshot list (page or asset saved by CDX).
  async _download_single(file_remote_info, total) {
    const file_url = String(file_remote_info.file_url);
    const file_id = file_remote_info.file_id;
    const file_timestamp = file_remote_info.timestamp;

    let paths;
    try {
      paths = this._determine_paths(file_url, file_id);
    } catch (e) {
      console.log(`Invalid path for ${file_url}: ${e}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    if (!paths) {
      console.log(`Skipping invalid URL: ${file_url}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    const { dir_path, file_path } = paths;

    if (fs.existsSync(file_path)) {
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    try {
      await this._structure_dir_path(dir_path);
      const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        console.log(`Skipping ${file_url}, fetch failed: ${e}`);
        return; // finally still advances the progress bar
      }

      if (!res.ok || !res.body) {
        console.log(`Skipping ${file_url}, bad response ${res.status}`);
        return;
      }

      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(file_path);
        Readable.fromWeb(res.body).pipe(ws);
        ws.on("finish", resolve);
        ws.on("error", reject);
      });

      const contentType = res.headers.get("content-type");
      const ext = path.extname(file_path).toLowerCase();
      const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
      if (looksHtml) {
        await this._process_html_assets(file_path, file_url, file_timestamp);
      }
    } catch (e) {
      console.log(`Download failed for ${file_url}: ${e}`);
    } finally {
      this._processed++;
      renderProgress(this._processed, total);
    }
  }

  // Orchestrate downloads with bounded concurrency.
  async download_files() {
    const startTime = Date.now();
    console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
    const list = await this.get_file_list_by_timestamp();
    if (list.length === 0) {
      console.log("No files to download.");
      return;
    }

    const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
    const limit = pLimit(concurrency);
    this._processed = 0;
    await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
    const endTime = Date.now();
    console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
  }
}
|
||||
|
||||
|
||||
// ============================= INTERACTIVE RUN =============================
|
||||
/** Promise wrapper around readline's callback API; resolves the trimmed answer. */
function ask(rl, question) {
  return new Promise((resolve) => {
    rl.question(question, (answer) => {
      resolve(answer.trim());
    });
  });
}
|
||||
|
||||
/**
 * Interactive entry point for the legacy single-file downloader: prompts
 * on stdin for all options, then runs a WaybackMachineDownloader.
 */
async function interactiveMain() {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  const saidYes = (answer) => /^y(es)?$/i.test(answer);

  // Keep asking until the input parses as a URL.
  let base_url;
  for (;;) {
    base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
    if (!base_url) continue;
    try {
      new URL(base_url);
      break;
    } catch {
      console.log("Please enter a valid URL.\n");
    }
  }

  const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
  const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");

  const rewriteAnswer = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
  const rewrite_mode = saidYes(rewriteAnswer) ? "relative" : "as-is";

  // Canonical handling only matters when links get rewritten.
  let canonical_action = "keep";
  if (rewrite_mode === "relative") {
    const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
    if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
  }

  const threadsAnswer = await ask(rl, "How many download threads? (default 3): ");
  let threads_count = parseInt(threadsAnswer || "3", 10);
  if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;

  const exact_url = saidYes(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
  const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");

  const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
  const download_external_assets = saidYes(ext);

  rl.close();

  const dl = new WaybackMachineDownloader({
    base_url,
    exact_url,
    directory: directory || null,
    from_timestamp: from_timestamp || 0,
    to_timestamp: to_timestamp || 0,
    threads_count,
    rewrite_mode,
    canonical_action,
    download_external_assets,
  });

  await dl.download_files();
}
|
||||
|
||||
// True when downloader.js is the entry script (not merely imported).
// FIX: the previous expression called pathToFileURL(process.argv[1])
// unconditionally; when argv[1] is absent (e.g. `node -e`, embedded
// runtimes) that throws a TypeError at module load. Guard and wrap it,
// matching the hardened check in the refactored index.js.
const isDirectRun = (() => {
  const entryArg = process.argv && process.argv.length > 1 ? process.argv[1] : null;
  if (!entryArg) return false;
  try {
    return (
      import.meta.url === `file://${entryArg}` ||
      import.meta.url === pathToFileURL(entryArg).href
    );
  } catch {
    return false;
  }
})();
|
||||
|
||||
if (isDirectRun) {
|
||||
interactiveMain().catch((err) => {
|
||||
console.error(`FATAL: ${err?.stack || err}`);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
export { WaybackMachineDownloader };
|
||||
39
wayback-machine-downloader/index.js
Normal file
39
wayback-machine-downloader/index.js
Normal file
@@ -0,0 +1,39 @@
|
||||
/*
|
||||
* Wayback Machine Downloader 0.3.0 by WhitelightSEO
|
||||
* Run: node index.js
|
||||
*/
|
||||
|
||||
import { pathToFileURL } from "url";
|
||||
|
||||
import { setDebugMode, getDebugMode, debugLog } from "./lib/logger.js";
|
||||
import { WaybackMachineDownloader } from "./lib/downloader.js";
|
||||
|
||||
// Global debug switch; flip to true for verbose logging via lib/logger.
const DEBUG_MODE = false;
setDebugMode(DEBUG_MODE);
|
||||
|
||||
// Decide whether this file is the entry script (vs. being imported).
// Tries the cheap raw-string comparison first, then the canonical
// pathToFileURL form; any resolution failure is logged and treated as
// "not a direct run".
const isDirectRun = (() => {
  const entryArg = process.argv && process.argv.length > 1 ? process.argv[1] : null;
  if (!entryArg) return false;

  if (import.meta.url === `file://${entryArg}`) return true;

  try {
    return import.meta.url === pathToFileURL(entryArg).href;
  } catch (e) {
    debugLog(`Failed to resolve entry script URL: ${e}`);
    return false;
  }
})();
|
||||
|
||||
if (isDirectRun) {
|
||||
import("./cli.js")
|
||||
.then(({ interactiveMain }) => interactiveMain())
|
||||
.catch((err) => {
|
||||
console.error(`FATAL: ${err?.stack || err}`);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
export { WaybackMachineDownloader, DEBUG_MODE, setDebugMode, getDebugMode };
|
||||
392
wayback-machine-downloader/lib/asset-manager.js
Normal file
392
wayback-machine-downloader/lib/asset-manager.js
Normal file
@@ -0,0 +1,392 @@
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { mkdir } from "fs/promises";
|
||||
import { load } from "cheerio";
|
||||
import { Readable } from "stream";
|
||||
import { domainToUnicode } from "url";
|
||||
|
||||
import { debugLog } from "./logger.js";
|
||||
import {
|
||||
relativeLink,
|
||||
ensureLocalTargetForPath,
|
||||
isCssResource,
|
||||
} from "./utils.js";
|
||||
|
||||
class AssetManager {
|
||||
constructor({
|
||||
backupPathResolver,
|
||||
rewriteLinks,
|
||||
canonicalAction,
|
||||
downloadExternalAssets,
|
||||
baseHostUnicode,
|
||||
snapshotIndex,
|
||||
}) {
|
||||
this.backupPathResolver = backupPathResolver;
|
||||
this.rewriteLinks = !!rewriteLinks;
|
||||
this.canonicalAction = canonicalAction || "keep";
|
||||
this.downloadExternalAssets = !!downloadExternalAssets;
|
||||
this.baseHostUnicode = (baseHostUnicode || "").toLowerCase();
|
||||
this.snapshotIndex = snapshotIndex || null;
|
||||
}
|
||||
|
||||
setSnapshotIndex(index) {
|
||||
this.snapshotIndex = index;
|
||||
}
|
||||
|
||||
get backupPath() {
|
||||
const resolver = this.backupPathResolver;
|
||||
return typeof resolver === "function" ? resolver() : resolver;
|
||||
}
|
||||
|
||||
windowsSanitize(p) {
|
||||
if (process.platform !== "win32") return p;
|
||||
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
|
||||
}
|
||||
|
||||
async ensureDir(dirPath) {
|
||||
try {
|
||||
await mkdir(dirPath, { recursive: true });
|
||||
} catch (e) {
|
||||
if (!e || e.code !== "EEXIST") throw e;
|
||||
}
|
||||
}
|
||||
|
||||
determinePaths(fileUrl, fileId) {
|
||||
if (!fileUrl || !fileId) return null;
|
||||
if (fileUrl.startsWith("data:") || fileUrl.startsWith("javascript:")) return null;
|
||||
if (fileId.length > 200) return null;
|
||||
|
||||
const backup = this.backupPath;
|
||||
const parts = fileId.split("/").filter(Boolean);
|
||||
let dirPath;
|
||||
let filePath;
|
||||
|
||||
if (fileId === "") {
|
||||
dirPath = backup;
|
||||
filePath = path.join(backup, "index.html");
|
||||
} else {
|
||||
const lastPart = parts[parts.length - 1] || "";
|
||||
if (fileUrl.endsWith("/") || !lastPart.includes(".")) {
|
||||
dirPath = path.join(backup, ...parts);
|
||||
filePath = path.join(dirPath, "index.html");
|
||||
} else {
|
||||
dirPath = path.join(backup, ...parts.slice(0, -1));
|
||||
filePath = path.join(backup, ...parts);
|
||||
}
|
||||
}
|
||||
|
||||
dirPath = this.windowsSanitize(dirPath);
|
||||
filePath = this.windowsSanitize(filePath);
|
||||
|
||||
return { dirPath, filePath };
|
||||
}
|
||||
|
||||
resolveAssetTimestamp(assetUrl, fallbackTimestamp) {
|
||||
if (!this.snapshotIndex) return fallbackTimestamp || 0;
|
||||
return this.snapshotIndex.resolve(assetUrl, fallbackTimestamp);
|
||||
}
|
||||
|
||||
async downloadAsset(assetUrl, pageTimestamp, filePath, dirPath) {
|
||||
try {
|
||||
if (fs.existsSync(filePath)) return filePath;
|
||||
|
||||
await this.ensureDir(dirPath);
|
||||
const assetTimestamp = this.resolveAssetTimestamp(assetUrl, pageTimestamp);
|
||||
if (!assetTimestamp) {
|
||||
debugLog(`Skipping asset ${assetUrl}, no timestamp available in range.`);
|
||||
return null;
|
||||
}
|
||||
const snapshotUrl = `https://web.archive.org/web/${assetTimestamp}id_/${assetUrl}`;
|
||||
let res;
|
||||
try {
|
||||
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
|
||||
} catch (e) {
|
||||
debugLog(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
|
||||
return null;
|
||||
}
|
||||
if (!res.ok || !res.body) {
|
||||
debugLog(`Skipping asset ${assetUrl}, bad response ${res.status}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
const contentType = res.headers.get("content-type") || "";
|
||||
|
||||
await new Promise((resolve, reject) => {
|
||||
const ws = fs.createWriteStream(filePath);
|
||||
Readable.fromWeb(res.body).pipe(ws);
|
||||
ws.on("finish", resolve);
|
||||
ws.on("error", reject);
|
||||
});
|
||||
|
||||
if (this.rewriteLinks && isCssResource(filePath, assetUrl, contentType)) {
|
||||
await this.rewriteCssFile(filePath, assetUrl, assetTimestamp);
|
||||
}
|
||||
|
||||
return filePath;
|
||||
} catch (e) {
|
||||
debugLog(`Asset download failed: ${assetUrl} → ${e}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
async rewriteCssContent(cssContent, cssSourceUrl, pageTimestamp, { baseDir, excludePath } = {}) {
|
||||
if (!this.rewriteLinks) {
|
||||
return { css: cssContent, downloads: [] };
|
||||
}
|
||||
|
||||
if (!cssContent || !cssContent.trim()) {
|
||||
return { css: cssContent, downloads: [] };
|
||||
}
|
||||
|
||||
const siteHost = this.baseHostUnicode;
|
||||
const downloads = [];
|
||||
const seenPaths = new Set();
|
||||
let updatedContent = cssContent;
|
||||
let cssChanged = false;
|
||||
|
||||
const processReference = (rawValue) => {
|
||||
if (!rawValue) return null;
|
||||
const trimmed = rawValue.trim();
|
||||
if (!trimmed) return null;
|
||||
if (/^(data:|javascript:|#)/i.test(trimmed)) return null;
|
||||
|
||||
let absoluteUrl;
|
||||
try {
|
||||
absoluteUrl = new URL(trimmed, cssSourceUrl).toString();
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
let parsed;
|
||||
try {
|
||||
parsed = new URL(absoluteUrl);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
if (!/^https?:$/i.test(parsed.protocol)) return null;
|
||||
|
||||
const normalizedHost = domainToUnicode(parsed.hostname.replace(/^www\./, "")).toLowerCase();
|
||||
const isInternal = normalizedHost === siteHost;
|
||||
if (!isInternal && !this.downloadExternalAssets) return null;
|
||||
|
||||
let fileId;
|
||||
try {
|
||||
fileId = decodeURIComponent(parsed.pathname);
|
||||
} catch {
|
||||
fileId = parsed.pathname;
|
||||
}
|
||||
let paths;
|
||||
try {
|
||||
paths = this.determinePaths(absoluteUrl, fileId);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
if (!paths) return null;
|
||||
|
||||
const { dirPath, filePath } = paths;
|
||||
const assetTimestamp = this.resolveAssetTimestamp(absoluteUrl, pageTimestamp);
|
||||
|
||||
if (
|
||||
filePath &&
|
||||
(!excludePath || path.resolve(filePath) !== path.resolve(excludePath))
|
||||
) {
|
||||
const key = path.resolve(filePath);
|
||||
if (!fs.existsSync(filePath) && !seenPaths.has(key)) {
|
||||
seenPaths.add(key);
|
||||
downloads.push(this.downloadAsset(absoluteUrl, assetTimestamp, filePath, dirPath));
|
||||
}
|
||||
}
|
||||
|
||||
const relativeBase = baseDir || path.dirname(filePath);
|
||||
const relativePath = relativeLink(relativeBase, filePath) + (parsed.hash || "");
|
||||
|
||||
return {
|
||||
original: trimmed,
|
||||
replacement: relativePath,
|
||||
};
|
||||
};
|
||||
|
||||
const urlPattern = /url\(\s*(['"]?)([^'")]+)\1\s*\)/gi;
|
||||
updatedContent = updatedContent.replace(urlPattern, (match, quote, value) => {
|
||||
const info = processReference(value);
|
||||
if (!info) return match;
|
||||
if (info.replacement === info.original) return match;
|
||||
cssChanged = true;
|
||||
const q = quote || "";
|
||||
return `url(${q}${info.replacement}${q})`;
|
||||
});
|
||||
|
||||
const importPattern = /@import\s+(?!url\()\s*(['"])([^'"]+)\1/gi;
|
||||
updatedContent = updatedContent.replace(importPattern, (match, quote, value) => {
|
||||
const info = processReference(value);
|
||||
if (!info) return match;
|
||||
if (info.replacement === info.original) return match;
|
||||
cssChanged = true;
|
||||
return match.replace(value, info.replacement);
|
||||
});
|
||||
|
||||
return {
|
||||
css: cssChanged && updatedContent !== cssContent ? updatedContent : cssContent,
|
||||
downloads,
|
||||
};
|
||||
}
|
||||
|
||||
async rewriteCssFile(cssPath, cssSourceUrl, pageTimestamp) {
|
||||
if (!this.rewriteLinks) return;
|
||||
|
||||
let cssContent;
|
||||
try {
|
||||
cssContent = fs.readFileSync(cssPath, "utf8");
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
|
||||
const cssDir = path.dirname(cssPath);
|
||||
const { css: updatedContent, downloads } = await this.rewriteCssContent(
|
||||
cssContent,
|
||||
cssSourceUrl,
|
||||
pageTimestamp,
|
||||
{
|
||||
baseDir: cssDir,
|
||||
excludePath: cssPath,
|
||||
}
|
||||
);
|
||||
|
||||
if (downloads.length > 0) {
|
||||
await Promise.all(downloads);
|
||||
}
|
||||
|
||||
if (updatedContent !== cssContent) {
|
||||
fs.writeFileSync(cssPath, updatedContent, "utf8");
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Post-process a saved HTML page in place:
 *  - rewrites internal (and optionally external) asset references to local
 *    relative paths when link rewriting is enabled;
 *  - rewrites embedded <style> blocks and inline style="" attributes via
 *    rewriteCssContent, queueing downloads for CSS-referenced assets;
 *  - queues downloads for referenced assets missing on disk;
 *  - rewrites internal <a href>/<form action> targets;
 *  - optionally removes <link rel="canonical">.
 * Errors are logged and swallowed so one bad page does not abort the run.
 *
 * @param {string} htmlPath - path of the HTML file on disk.
 * @param {string} pageUrl - original page URL; base for resolving relative refs.
 * @param {string|number} pageTimestamp - snapshot timestamp used for asset fetches.
 */
async processHtml(htmlPath, pageUrl, pageTimestamp) {
  try {
    let html = fs.readFileSync(htmlPath, "utf8");
    // decodeEntities:false preserves the page's original entity encoding.
    const $ = load(html, { decodeEntities: false });
    const siteHost = this.baseHostUnicode;
    const baseDir = path.dirname(htmlPath);
    const backupRoot = this.backupPath;

    // Download promises accumulated while walking the DOM; awaited together below.
    const downloadTasks = [];

    // Rewrite one CSS fragment (a <style> body or a style="" value) and
    // collect any asset downloads it triggers.
    const handleCssFragment = async (cssText) => {
      const { css: updatedCss, downloads } = await this.rewriteCssContent(
        cssText,
        pageUrl,
        pageTimestamp,
        { baseDir }
      );
      if (downloads.length > 0) {
        downloadTasks.push(...downloads);
      }
      return updatedCss;
    };

    // Pass 1: asset-bearing elements (images, scripts, stylesheets, media, frames).
    $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
      // <link> carries its URL in href; everything else in src.
      const attr = el.tagName === "link" ? "href" : "src";
      const val = $(el).attr(attr);
      if (!val) return;

      try {
        const abs = new URL(val, pageUrl).toString();
        const u = new URL(abs);
        // Compare hosts with "www." stripped and punycode decoded.
        const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase();
        const isInternal = normalizedHost === siteHost;

        if (isInternal || this.downloadExternalAssets) {
          let fileId;
          try {
            fileId = decodeURIComponent(u.pathname);
          } catch {
            // Malformed percent-encoding: fall back to the raw pathname.
            fileId = u.pathname;
          }
          let paths;
          try {
            paths = this.determinePaths(abs, fileId);
          } catch (e) {
            console.log(`Invalid path for asset ${abs}: ${e}`);
            return;
          }
          if (!paths) return;
          const { dirPath, filePath } = paths;

          if (this.rewriteLinks) {
            // NOTE(review): the fragment is appended BEFORE
            // ensureLocalTargetForPath, so a hash on an extensionless path
            // ends up inside the mapped directory name — confirm intended.
            const normPath = fileId + (u.hash || "");
            const localTarget = ensureLocalTargetForPath(normPath);
            const localAbsPath = path.join(backupRoot, localTarget);
            $(el).attr(attr, relativeLink(baseDir, localAbsPath));
          }

          // Only fetch assets we do not already have on disk.
          if (!fs.existsSync(filePath)) {
            downloadTasks.push(
              this.downloadAsset(abs, pageTimestamp, filePath, dirPath)
            );
          }
        }
      } catch {}
    });

    // Pass 2: embedded <style> blocks.
    const styleNodes = $("style").toArray();
    for (const node of styleNodes) {
      const cssText = $(node).html();
      if (!cssText) continue;
      const updated = await handleCssFragment(cssText);
      if (updated !== cssText) {
        $(node).text(updated);
      }
    }

    // Pass 3: inline style="" attributes.
    const inlineStyled = $("[style]").toArray();
    for (const node of inlineStyled) {
      const styleAttr = $(node).attr("style");
      if (!styleAttr) continue;
      const updated = await handleCssFragment(styleAttr);
      if (updated !== styleAttr) {
        $(node).attr("style", updated);
      }
    }

    // Pass 4: navigation links — only internal targets are rewritten, and
    // only when link rewriting is enabled.
    if (this.rewriteLinks) {
      $("a[href], form[action]").each((_, el) => {
        const attr = el.tagName === "a" ? "href" : "action";
        const val = $(el).attr(attr);
        if (!val) return;

        try {
          const abs = new URL(val, pageUrl).toString();
          const u = new URL(abs);
          const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase();
          const isInternal = normalizedHost === siteHost;

          if (isInternal) {
            let normPath;
            try {
              normPath = decodeURIComponent(u.pathname);
            } catch {
              normPath = u.pathname;
            }
            normPath += u.hash || "";
            const localTarget = ensureLocalTargetForPath(normPath);
            const localAbsPath = path.join(backupRoot, localTarget);
            $(el).attr(attr, relativeLink(baseDir, localAbsPath));
          }
        } catch {}
      });
    }

    // Wait for every queued asset download before serializing.
    await Promise.all(downloadTasks);

    if (this.canonicalAction === "remove") {
      $("link[rel=\"canonical\"]").remove();
    }

    // Write the (possibly modified) DOM back over the original file.
    fs.writeFileSync(htmlPath, $.html(), "utf8");
  } catch (e) {
    console.log(`HTML processing error: ${e}`);
  }
}
|
||||
}
|
||||
|
||||
export { AssetManager };
|
||||
222
wayback-machine-downloader/lib/downloader.js
Normal file
222
wayback-machine-downloader/lib/downloader.js
Normal file
@@ -0,0 +1,222 @@
|
||||
import fs from "fs";
|
||||
import path from "path";
|
||||
import { domainToUnicode } from "url";
|
||||
import pLimit from "p-limit";
|
||||
import { Readable } from "stream";
|
||||
|
||||
import { debugLog } from "./logger.js";
|
||||
import { renderProgress, normalizeBaseUrlInput, isHtmlFile, isCssResource } from "./utils.js";
|
||||
import { SnapshotIndex } from "./snapshot-index.js";
|
||||
import { AssetManager } from "./asset-manager.js";
|
||||
|
||||
/**
 * Query the Wayback Machine CDX API for (timestamp, original-url) pairs.
 *
 * @param {object} opts
 * @param {string} opts.baseUrl - URL or URL pattern to look up.
 * @param {number|null} opts.pageIndex - CDX result page, or null for unpaged.
 * @param {boolean} opts.all - when false, restrict to HTTP 200 captures.
 * @param {string|number} [opts.fromTimestamp] - lower bound (YYYYMMDDhhmmss); 0/empty disables.
 * @param {string|number} [opts.toTimestamp] - upper bound (YYYYMMDDhhmmss); 0/empty disables.
 * @returns {Promise<Array<[string, string]>>} result rows with the header row
 *   stripped; [] on any network, HTTP, or parse failure (best-effort by design).
 */
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
  // FIX: the CDX endpoint is /cdx/search/cdx — the previous "/cdx/search/xd"
  // path is not a valid API route, so every query silently returned [].
  const cdx = new URL("https://web.archive.org/cdx/search/cdx");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  // Collapse identical content digests to skip byte-for-byte duplicate captures.
  params.set("collapse", "digest");
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
  if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    const text = await res.text();
    let json = [];
    try {
      json = JSON.parse(text);
    } catch {
      // Non-JSON body (error page, rate limit): treat as an empty result.
      return [];
    }
    // The first row of a CDX JSON response is the field-name header; drop it.
    if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
      json.shift();
    }
    return json || [];
  } catch {
    // Network failure: callers expect an empty list, never an exception.
    return [];
  }
}
|
||||
|
||||
/**
 * Orchestrates a Wayback Machine site download: enumerates snapshots via the
 * CDX API, deduplicates them into a newest-first file manifest, downloads the
 * files with bounded concurrency, and optionally rewrites links/CSS in the
 * saved copies via AssetManager.
 */
class WaybackMachineDownloader {
  /**
   * @param {object} params
   * @param {string} [params.base_url] - site to archive; required unless
   *   params.normalized_base is supplied.
   * @param {object} [params.normalized_base] - precomputed result of
   *   normalizeBaseUrlInput (e.g. from the CLI) to avoid re-parsing.
   * @param {boolean} [params.exact_url=false] - skip the "/*" wildcard scan.
   * @param {string} [params.directory] - output dir; default websites/<host>/.
   * @param {string|number} [params.from_timestamp=0] - YYYYMMDDhhmmss lower bound (0 = none).
   * @param {string|number} [params.to_timestamp=0] - YYYYMMDDhhmmss upper bound (0 = none).
   * @param {number} [params.threads_count=3] - max concurrent downloads.
   * @param {boolean} [params.download_external_assets=false]
   * @param {string} [params.rewrite_mode="as-is"] - "relative" enables link rewriting.
   * @param {string} [params.canonical_action="keep"] - "remove" strips canonical links.
   */
  constructor(params) {
    // Accept either a raw base_url or an already-normalized descriptor.
    const normalized = params.normalized_base || normalizeBaseUrlInput(params.base_url);

    this.base_url = normalized.canonicalUrl;
    this.base_variants = normalized.variants;
    // Unicode host, used downstream to classify links as internal/external.
    this.base_host_unicode = (normalized.unicodeHost || normalized.bareHost).toLowerCase();

    this.exact_url = !!params.exact_url;
    this.directory = params.directory || null;
    // 0 means "no bound" for both timestamp filters.
    this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
    this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;

    this.download_external_assets = params.download_external_assets || false;

    this.rewrite_mode = params.rewrite_mode || "as-is";
    this.rewrite_links = this.rewrite_mode === "relative";
    this.canonical_action = params.canonical_action || "keep";

    // Progress counter shared by the concurrent download tasks.
    this._processed = 0;
    this.snapshotIndex = null;

    // AssetManager handles path mapping, CSS/HTML rewriting and asset
    // downloads; the snapshot index is attached later, once it is built.
    this.assetManager = new AssetManager({
      backupPathResolver: () => this.backup_path(),
      rewriteLinks: this.rewrite_links,
      canonicalAction: this.canonical_action,
      downloadExternalAssets: this.download_external_assets,
      baseHostUnicode: this.base_host_unicode,
      snapshotIndex: null,
    });
  }

  // Directory-friendly site name: the Unicode host when base_url parses as a
  // URL (the "//" check is a cheap pre-test), otherwise the raw base_url.
  backup_name() {
    try {
      if (this.base_url.includes("//")) {
        const u = new URL(this.base_url);
        return domainToUnicode(u.host);
      }
    } catch {}
    return this.base_url;
  }

  // Output root for downloaded files, always ending with a path separator.
  backup_path() {
    if (this.directory) {
      return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
    }
    return path.join("websites", this.backup_name(), path.sep);
  }

  /**
   * Query the CDX API for every base-URL variant (and, unless exact_url is
   * set, its paged "/*" wildcard) and concatenate the raw rows.
   * @returns {Promise<Array<[string, string]>>} [timestamp, original] pairs.
   */
  async get_all_snapshots_to_consider() {
    console.log("Getting snapshot pages");
    const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
    let list = [];
    const bases = this.base_variants && this.base_variants.length > 0 ? this.base_variants : [this.base_url];

    for (const base of bases) {
      list = list.concat(await getRawListFromApi({ baseUrl: base, pageIndex: null, ...httpOpts }));
      process.stdout.write(".");

      if (!this.exact_url) {
        const wildcard = base.endsWith("/*") ? base : base.replace(/\/*$/, "") + "/*";
        // Page through wildcard results; 100 pages is a hard safety cap.
        for (let i = 0; i < 100; i++) {
          const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
          if (!batch || batch.length === 0) break;
          list = list.concat(batch);
          process.stdout.write(".");
        }
      }
    }
    console.log(` found ${list.length} snapshots to consider.\n`);
    return list;
  }

  /**
   * Deduplicate raw snapshot rows into a newest-first manifest via
   * SnapshotIndex, and share the index with the asset manager so asset
   * downloads can resolve matching timestamps.
   * @returns {Promise<Array<object>>} manifest entries ({file_url, file_id, timestamp}).
   */
  async get_file_list_by_timestamp() {
    const index = new SnapshotIndex();
    const all = await this.get_all_snapshots_to_consider();
    for (const pair of all) {
      const ts = pair && pair[0];
      const url = pair && pair[1];
      if (!ts || !url) continue;
      index.register(url, ts);
    }

    const manifest = index.getManifest();
    this.snapshotIndex = index;
    this.assetManager.setSnapshotIndex(index);
    return manifest;
  }

  /**
   * Download one manifest entry to disk and, when rewriting is enabled,
   * post-process it as CSS and/or HTML. Advances the progress bar exactly
   * once per entry, including on skip and failure paths.
   * @param {{file_url: string, file_id: string, timestamp: string}} file_remote_info
   * @param {number} total - manifest size, for progress rendering.
   */
  async _download_single(file_remote_info, total) {
    const file_url = String(file_remote_info.file_url);
    const file_id = file_remote_info.file_id;
    const file_timestamp = file_remote_info.timestamp;

    let paths;
    try {
      paths = this.assetManager.determinePaths(file_url, file_id);
    } catch (e) {
      console.log(`Invalid path for ${file_url}: ${e}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    if (!paths) {
      console.log(`Skipping invalid URL: ${file_url}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    const { dirPath, filePath } = paths;

    // Resume support: never re-download a file that already exists on disk.
    if (fs.existsSync(filePath)) {
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    try {
      await this.assetManager.ensureDir(dirPath);
      // The "id_" flag asks the archive for the raw capture body (no
      // archive.org banner/rewriting) — standard Wayback URL convention.
      const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        debugLog(`Skipping ${file_url}, fetch failed: ${e}`);
        return;
      }

      if (!res.ok || !res.body) {
        debugLog(`Skipping ${file_url}, bad response ${res.status}`);
        return;
      }

      // Stream the response body straight to disk.
      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(filePath);
        Readable.fromWeb(res.body).pipe(ws);
        ws.on("finish", resolve);
        ws.on("error", reject);
      });

      const contentType = res.headers.get("content-type") || "";
      const ext = path.extname(filePath).toLowerCase();
      // Extensionless files are treated as HTML candidates too.
      const looksHtml = isHtmlFile(filePath, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
      if (this.rewrite_links && isCssResource(filePath, file_url, contentType)) {
        await this.assetManager.rewriteCssFile(filePath, file_url, file_timestamp);
      }
      if (this.rewrite_links && looksHtml) {
        await this.assetManager.processHtml(filePath, file_url, file_timestamp);
      }
    } catch (e) {
      debugLog(`Download failed for ${file_url}: ${e}`);
    } finally {
      // The finally block also covers the early returns inside the try, so
      // every entry that reaches this point counts exactly once.
      this._processed++;
      renderProgress(this._processed, total);
    }
  }

  /**
   * Entry point: build the manifest and download everything with at most
   * threads_count concurrent transfers, reporting progress and total time.
   */
  async download_files() {
    const startTime = Date.now();
    console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
    const list = await this.get_file_list_by_timestamp();
    if (list.length === 0) {
      console.log("No files to download.");
      return;
    }

    const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
    const limit = pLimit(concurrency);
    this._processed = 0;
    await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
    const endTime = Date.now();
    console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
  }
}
|
||||
|
||||
export { WaybackMachineDownloader };
|
||||
21
wayback-machine-downloader/lib/logger.js
Normal file
21
wayback-machine-downloader/lib/logger.js
Normal file
@@ -0,0 +1,21 @@
|
||||
// Module-private flag controlling whether debugLog produces output.
let verboseEnabled = false;

/** Enable or disable debug logging; the value is coerced to a boolean. */
function setDebugMode(value) {
  verboseEnabled = Boolean(value);
}

/** @returns {boolean} the current debug-mode flag. */
function getDebugMode() {
  return verboseEnabled;
}

/** Log to the console, but only while debug mode is enabled. */
function debugLog(...args) {
  if (!verboseEnabled) return;
  console.log(...args);
}
|
||||
|
||||
/** Unconditional console logging helper (counterpart to debugLog). */
function infoLog(...messageParts) {
  console.log(...messageParts);
}
|
||||
|
||||
export { setDebugMode, getDebugMode, debugLog, infoLog };
|
||||
138
wayback-machine-downloader/lib/snapshot-index.js
Normal file
138
wayback-machine-downloader/lib/snapshot-index.js
Normal file
@@ -0,0 +1,138 @@
|
||||
/**
 * In-memory index of Wayback captures, keyed by decoded URL path and by
 * path+query. Keeps the newest capture per key and lazily derives a
 * newest-first download manifest plus timestamp lookup maps.
 */
class SnapshotIndex {
  constructor() {
    this.byPath = new Map();          // decoded pathname -> newest entry
    this.byPathAndQuery = new Map();  // pathname + search -> newest entry
    this.lookupByPath = null;         // pathname -> timestamp (built lazily)
    this.lookupByPathAndQuery = null; // pathname + search -> timestamp (built lazily)
    this.manifestCache = null;        // newest-first manifest (built lazily)
  }

  /**
   * Record one capture. An equal-or-newer timestamp replaces the stored
   * entry for the same key. Unparsable URLs and missing args are ignored.
   * @param {string} url - original captured URL.
   * @param {string|number} timestamp - YYYYMMDDhhmmss capture timestamp.
   */
  register(url, timestamp) {
    if (!url || !timestamp) return;

    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      return;
    }

    let decodedPath;
    try {
      decodedPath = decodeURIComponent(parsed.pathname);
    } catch {
      // Malformed percent-encoding: keep the raw pathname.
      decodedPath = parsed.pathname;
    }
    const queryKey = decodedPath + (parsed.search || "");
    const ts = String(timestamp);

    const makeEntry = () => ({ file_url: url, timestamp: ts, file_id: decodedPath });

    const existingPathEntry = this.byPath.get(decodedPath);
    if (!existingPathEntry || String(existingPathEntry.timestamp) <= ts) {
      this.byPath.set(decodedPath, makeEntry());
    }

    const existingQueryEntry = this.byPathAndQuery.get(queryKey);
    if (!existingQueryEntry || String(existingQueryEntry.timestamp) <= ts) {
      this.byPathAndQuery.set(queryKey, makeEntry());
    }

    // Any mutation invalidates the lazily built caches.
    this.lookupByPath = null;
    this.lookupByPathAndQuery = null;
    this.manifestCache = null;
  }

  /** Build the manifest and the two timestamp lookup maps if stale. */
  buildCaches() {
    if (this.manifestCache) return;

    const manifest = [];
    for (const [file_id, value] of this.byPath.entries()) {
      manifest.push({ ...value, file_id });
    }
    // Newest first; timestamps are fixed-width digit strings, so a plain
    // string comparison orders them chronologically.
    manifest.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));

    const pathLookup = new Map();
    const queryLookup = new Map();

    // First pass: seed lookups from the manifest (newest entry wins because
    // of the sort order above).
    for (const { file_url, file_id, timestamp } of manifest) {
      if (file_id && timestamp && !pathLookup.has(file_id)) {
        pathLookup.set(file_id, timestamp);
      }
      if (!file_url) continue;
      try {
        const u = new URL(file_url);
        let decodedPath;
        try {
          decodedPath = decodeURIComponent(u.pathname);
        } catch {
          decodedPath = u.pathname;
        }
        const pathKey = decodedPath + (u.search || "");
        if (pathKey && timestamp && !queryLookup.has(pathKey)) {
          queryLookup.set(pathKey, timestamp);
        }
      } catch {}
    }

    // Second pass: fill remaining gaps from the query-keyed map (captures
    // that differ only by query string, or paths not covered above).
    for (const [queryKey, entry] of this.byPathAndQuery.entries()) {
      const ts = entry && entry.timestamp;
      if (!queryKey || !ts) continue;
      if (!queryLookup.has(queryKey)) {
        queryLookup.set(queryKey, ts);
      }
      const bare = queryKey.replace(/\?.*$/, "");
      if (bare && !pathLookup.has(bare)) {
        pathLookup.set(bare, ts);
      }
    }

    this.manifestCache = manifest;
    this.lookupByPath = pathLookup;
    this.lookupByPathAndQuery = queryLookup;
  }

  /** @returns {Array<object>} newest-first list of unique files to download. */
  getManifest() {
    this.buildCaches();
    return this.manifestCache ?? [];
  }

  /**
   * Best snapshot timestamp for an asset URL: an exact path+query match
   * wins, then the bare path, else the provided fallback (or 0).
   * @param {string} assetUrl
   * @param {string|number} fallbackTimestamp
   */
  resolve(assetUrl, fallbackTimestamp) {
    this.buildCaches();
    const fallback = fallbackTimestamp || 0;
    if (!assetUrl) return fallback;

    try {
      const u = new URL(assetUrl);
      let decodedPath;
      try {
        decodedPath = decodeURIComponent(u.pathname);
      } catch {
        decodedPath = u.pathname;
      }
      const queryKey = decodedPath + (u.search || "");
      if (this.lookupByPathAndQuery && this.lookupByPathAndQuery.has(queryKey)) {
        return this.lookupByPathAndQuery.get(queryKey);
      }
      if (this.lookupByPath && this.lookupByPath.has(decodedPath)) {
        return this.lookupByPath.get(decodedPath);
      }
    } catch {}

    return fallback;
  }
}
|
||||
|
||||
export { SnapshotIndex };
|
||||
117
wayback-machine-downloader/lib/utils.js
Normal file
117
wayback-machine-downloader/lib/utils.js
Normal file
@@ -0,0 +1,117 @@
|
||||
import path from "path";
|
||||
import { domainToUnicode } from "url";
|
||||
|
||||
/**
 * Draw a single-line console progress bar, overwriting the previous line
 * with \r. Emits a trailing newline once current reaches total.
 * @param {number} current - items completed so far.
 * @param {number} total - total item count (0 renders as 0%).
 */
function renderProgress(current, total) {
  const BAR_WIDTH = 40;
  const ratio = total > 0 ? current / total : 0;
  const filledCount = Math.round(ratio * BAR_WIDTH);
  const filledPart = "█".repeat(filledCount);
  const emptyPart = "-".repeat(BAR_WIDTH - filledCount);
  const percent = Math.round(ratio * 100);
  process.stdout.write(`\r[${filledPart}${emptyPart}] ${percent}% (${current}/${total})`);
  if (current === total) {
    process.stdout.write("\n");
  }
}
|
||||
|
||||
/** Convert a platform-specific path to forward-slash (POSIX) form. */
function toPosix(p) {
  return p.replaceAll(path.sep, "/");
}
|
||||
|
||||
/**
 * Relative link from a directory to a file, in forward-slash form.
 * Falls back to the file's basename when the relative path is empty.
 */
function relativeLink(fromDir, toFile) {
  const relative = path.relative(fromDir, toFile) || path.basename(toFile);
  return relative.split(path.sep).join("/");
}
|
||||
|
||||
/**
 * Map a URL pathname to a local file target: directory-like paths (trailing
 * slash, or a last segment with no dot) get index.html appended; paths that
 * already look like files pass through unchanged.
 */
function ensureLocalTargetForPath(pathname) {
  const lastSegment = path.posix.basename(pathname);
  const looksLikeDirectory = pathname.endsWith("/") || !lastSegment.includes(".");
  if (looksLikeDirectory) {
    return path.posix.join(pathname, "index.html");
  }
  return pathname;
}
|
||||
|
||||
/**
 * Parse a user-supplied domain or URL into a canonical https URL plus the
 * set of scheme/host variants worth querying in the Wayback Machine.
 *
 * @param {string} input - domain or URL; scheme defaults to https.
 * @returns {{canonicalUrl: string, variants: string[], bareHost: string, unicodeHost: string}}
 * @throws {Error} on empty input, an unparsable URL, a non-http(s) scheme,
 *   or a missing hostname.
 */
function normalizeBaseUrlInput(input) {
  if (!input || typeof input !== "string") {
    throw new Error("Base URL must be a non-empty string");
  }

  let raw = input.trim();
  if (!raw) {
    throw new Error("Base URL must not be empty");
  }

  // Bare domains get an https scheme so the URL parser accepts them.
  const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(raw);
  if (!hasScheme) {
    raw = `https://${raw}`;
  }

  let parsed;
  try {
    parsed = new URL(raw);
  } catch (e) {
    throw new Error(`Invalid URL: ${e.message}`);
  }

  if (!/^https?:$/i.test(parsed.protocol)) {
    throw new Error("Only http and https protocols are supported");
  }

  const asciiHost = parsed.hostname.toLowerCase();
  if (!asciiHost) {
    throw new Error("URL must contain a hostname");
  }

  const bareHost = asciiHost.replace(/^www\./, "");
  const unicodeHost = domainToUnicode(bareHost);
  const port = parsed.port ? `:${parsed.port}` : "";

  // Keep the path (minus trailing slashes) unless it is just "/".
  let basePath = "";
  if (parsed.pathname && parsed.pathname !== "/") {
    basePath = parsed.pathname.replace(/\/+$/, "");
  }

  const canonicalUrl = `https://${bareHost}${port}${basePath}`;

  // Query both the bare host and its www. twin (or the original www host).
  const hosts = new Set([`${bareHost}${port}`]);
  if (asciiHost !== bareHost) {
    hosts.add(`${asciiHost}${port}`);
  } else if (bareHost && bareHost.includes(".")) {
    hosts.add(`www.${bareHost}${port}`);
  }

  const variants = new Set();
  for (const protocol of ["https:", "http:"]) {
    for (const host of hosts) {
      variants.add(`${protocol}//${host}${basePath}`);
    }
  }

  return {
    canonicalUrl,
    variants: [...variants],
    bareHost,
    unicodeHost,
  };
}
|
||||
|
||||
/**
 * Heuristic HTML detection: HTTP content-type, then file extension, then a
 * sniff of the first bytes for doctype/<html> markers.
 * @param {string} filePath
 * @param {string|null} contentType - HTTP Content-Type header, if known.
 * @param {Buffer|string|null} firstBytes - leading bytes of the file, if available.
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  if (contentType && /text\/html/i.test(String(contentType))) return true;
  const htmlExtensions = new Set([".html", ".htm", ".php", ".asp", ".aspx"]);
  if (htmlExtensions.has(path.extname(filePath).toLowerCase())) return true;
  const head = (firstBytes || "").toString("utf8", 0, 512);
  if (/<!doctype html/i.test(head)) return true;
  return /<html[\s>]/i.test(head);
}
|
||||
|
||||
/**
 * Decide whether a downloaded resource is a CSS stylesheet, checking the
 * local file extension, the HTTP content-type, then the source URL's path.
 * @param {string} filePath
 * @param {string|null} resourceUrl
 * @param {string|null} contentType
 */
function isCssResource(filePath, resourceUrl, contentType) {
  if (path.extname(filePath || "").toLowerCase() === ".css") return true;
  if (contentType && /text\/css/i.test(String(contentType))) return true;
  if (!resourceUrl) return false;
  try {
    const { pathname } = new URL(resourceUrl);
    return /\.css(?:$|\?)/i.test(pathname);
  } catch {
    // Unparsable URL: no further evidence available.
    return false;
  }
}
|
||||
|
||||
export {
|
||||
renderProgress,
|
||||
toPosix,
|
||||
relativeLink,
|
||||
ensureLocalTargetForPath,
|
||||
normalizeBaseUrlInput,
|
||||
isHtmlFile,
|
||||
isCssResource,
|
||||
};
|
||||
@@ -1,14 +1,22 @@
|
||||
{
|
||||
"name": "wayback-downloader",
|
||||
"name": "wayback-machine-downloader",
|
||||
"version": "0.2.1",
|
||||
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
|
||||
"type": "module",
|
||||
"main": "downloader.js",
|
||||
"main": "./index.js",
|
||||
"exports": {
|
||||
".": "./index.js",
|
||||
"./downloader": "./lib/downloader.js",
|
||||
"./downloader.js": "./lib/downloader.js",
|
||||
"./cli": "./cli.js",
|
||||
"./package.json": "./package.json"
|
||||
},
|
||||
"bin": {
|
||||
"wayback-downloader": "downloader.js"
|
||||
"wayback-machine-downloader": "./cli.js"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "node downloader.js"
|
||||
"start": "node cli.js",
|
||||
"download": "node cli.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"cheerio": "^1.0.0-rc.12",
|
||||
@@ -17,19 +25,25 @@
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"files": [
|
||||
"cli.js",
|
||||
"index.js",
|
||||
"lib"
|
||||
],
|
||||
"keywords": [
|
||||
"wayback-machine-downloader",
|
||||
"web-archive-downloder",
|
||||
"archiver"
|
||||
"wayback",
|
||||
"archive",
|
||||
"downloader",
|
||||
"wayback-machine"
|
||||
],
|
||||
"author": "birbwatcher",
|
||||
"license": "MIT",
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/birbwatcher/wayback-downloader.git"
|
||||
"url": "https://github.com/birbwatcher/wayback-machine-downloader.git"
|
||||
},
|
||||
"bugs": {
|
||||
"url": "https://github.com/birbwatcher/wayback-downloader/issues"
|
||||
"url": "https://github.com/birbwatcher/wayback-machine-downloader/issues"
|
||||
},
|
||||
"homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
|
||||
"homepage": "https://github.com/birbwatcher/wayback-machine-downloader#readme"
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user