Merge pull request #8 from birbwatcher/work

feat: split into files and made some fixes
This commit is contained in:
WhitelightSEO
2025-10-26 10:19:56 +01:00
committed by GitHub
11 changed files with 1043 additions and 508 deletions

View File

@@ -84,7 +84,7 @@ Got ideas or suggestions? Feel free to open an issue!
## Run ## Run
```bash ```bash
node downloader.js node index.js
``` ```
After launching, an interactive menu will appear with the following questions: After launching, an interactive menu will appear with the following questions:

View File

@@ -0,0 +1,88 @@
#!/usr/bin/env node
import path from "path";
import readline from "readline";
import { pathToFileURL } from "url";
import { WaybackMachineDownloader } from "./lib/downloader.js";
import { normalizeBaseUrlInput } from "./lib/utils.js";
/**
 * Prompt once on the given readline interface.
 *
 * @param {{ question: Function }} rl - readline interface (or compatible object).
 * @param {string} question - Prompt text shown to the user.
 * @returns {Promise<string>} The reply with surrounding whitespace removed.
 */
function ask(rl, question) {
  return new Promise((resolve) => {
    rl.question(question, (reply) => {
      resolve(reply.trim());
    });
  });
}
/**
 * Interactive entry point: asks for the target site and download options on
 * stdin, then runs a WaybackMachineDownloader with the collected answers.
 *
 * @returns {Promise<void>} Resolves once `download_files()` has finished.
 */
async function interactiveMain() {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  const saidYes = (reply) => /^y(es)?$/i.test(reply);

  // Re-prompt until the input is non-empty and normalizes without throwing.
  let normalizedBase;
  for (;;) {
    const baseInput = await ask(rl, "Enter domain or URL to archive (e.g., example.com): ");
    if (baseInput) {
      try {
        normalizedBase = normalizeBaseUrlInput(baseInput);
        break;
      } catch {
        console.log("Please enter a valid domain or URL.\n");
      }
    }
  }
  const base_url = normalizedBase.canonicalUrl;

  const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
  const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");

  const rewriteReply = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
  const rewrite_mode = saidYes(rewriteReply) ? "relative" : "as-is";

  // The canonical question is only asked when links are being rewritten.
  let canonical_action = "keep";
  if (rewrite_mode === "relative") {
    const canonicalReply = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
    if ((canonicalReply || "").toLowerCase() === "remove") {
      canonical_action = "remove";
    }
  }

  const threadsReply = await ask(rl, "How many download threads? (default 3): ");
  const parsedThreads = parseInt(threadsReply || "3", 10);
  const threads_count = Number.isFinite(parsedThreads) && parsedThreads > 0 ? parsedThreads : 3;

  const exactReply = await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ");
  const exact_url = saidYes(exactReply);
  const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
  const externalReply = await ask(rl, "Download external assets? (yes/no, default no): ");
  const download_external_assets = saidYes(externalReply);
  rl.close();

  const downloader = new WaybackMachineDownloader({
    base_url,
    normalized_base: normalizedBase,
    exact_url,
    directory: directory || null,
    from_timestamp: from_timestamp || 0,
    to_timestamp: to_timestamp || 0,
    threads_count,
    rewrite_mode,
    canonical_action,
    download_external_assets,
  });
  await downloader.download_files();
}
/**
 * True when this module is the script Node was started with.
 *
 * The entry path is converted with `pathToFileURL` rather than by gluing
 * "file://" in front of it: `import.meta.url` is a percent-encoded file URL
 * (and has a different shape for Windows drive paths), so a hand-built string
 * does not match for paths containing spaces, non-ASCII characters or a drive
 * letter.
 */
const isDirectCliRun = (() => {
  const entryArg = process.argv?.[1];
  if (!entryArg) return false;
  try {
    return import.meta.url === pathToFileURL(path.resolve(entryArg)).href;
  } catch {
    // An entry argument that cannot be turned into a file URL is not us.
    return false;
  }
})();
// Start the interactive prompt only when this module is the process entry
// script; a plain import of the module must not start prompting.
if (isDirectCliRun) {
interactiveMain().catch((err) => {
// Print the stack when the rejection value has one, then exit non-zero.
console.error(`FATAL: ${err?.stack || err}`);
process.exit(1);
});
}
export { interactiveMain };

View File

@@ -7,7 +7,4 @@ RUN npm install --production
COPY . . COPY . .
CMD ["node", "downloader.js"] ENTRYPOINT ["node", "index.js"]
ENTRYPOINT ["node", "downloader.js"]

View File

@@ -1,493 +0,0 @@
/*
* Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
* Run: node downloader.js
*/
import fs from "fs";
import path from "path";
import { fileURLToPath, pathToFileURL, domainToUnicode } from "url";
import { mkdir } from "fs/promises";
import pLimit from "p-limit";
import { load } from "cheerio";
import { Readable } from "stream";
import readline from "readline";
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// ----------------------------- PROGRESS BAR -----------------------------
/**
 * Redraw a single-line progress bar on stdout.
 *
 * @param {number} current - Number of items processed so far.
 * @param {number} total - Number of items expected.
 *
 * The ratio is clamped to [0, 1]: without the clamp a `current` larger than
 * `total` makes `width - filled` negative and `String.prototype.repeat`
 * throws a RangeError. The raw counters are still printed unchanged.
 */
function renderProgress(current, total) {
  const width = 40;
  const rawRatio = total > 0 ? current / total : 0;
  const ratio = Number.isFinite(rawRatio) ? Math.min(1, Math.max(0, rawRatio)) : 0;
  const filled = Math.round(ratio * width);
  const bar = "█".repeat(filled) + "-".repeat(width - filled);
  process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
  // Finish the line once the last item has been reported.
  if (current === total) process.stdout.write("\n");
}
// ----------------------------- HELPERS -----------------------------
/**
 * Convert a platform path to forward-slash form.
 *
 * @param {string} p - Path using the platform separator.
 * @returns {string} The same path with every `path.sep` replaced by "/".
 */
function toPosix(p) {
  return p.replaceAll(path.sep, "/");
}
/**
 * Build a forward-slash relative link from a directory to a file.
 *
 * @param {string} fromDir - Directory the link is written in.
 * @param {string} toFile - Target file path.
 * @returns {string} Relative link; when both paths are identical the target's
 *   base name is used instead of an empty string.
 */
function relativeLink(fromDir, toFile) {
  let rel = path.relative(fromDir, toFile);
  if (!rel) {
    rel = path.basename(toFile);
  }
  return toPosix(rel);
}
/**
 * Map a URL path to the local file it is stored as.
 *
 * Paths that end in "/" or whose last segment has no "." are treated as
 * directories and get "index.html" appended; anything else is returned as is.
 *
 * Callers pass `pathname + hash`, so a trailing "#fragment" is split off before
 * the decision and re-attached afterwards. Previously "/about#team" became
 * "/about#team/index.html", and a dot inside the fragment could suppress the
 * index.html suffix.
 *
 * @param {string} pathname - URL path, optionally followed by "#fragment".
 * @returns {string} Local target path, with the fragment (if any) preserved.
 */
function ensureLocalTargetForPath(pathname) {
  const hashAt = pathname.indexOf("#");
  const purePath = hashAt === -1 ? pathname : pathname.slice(0, hashAt);
  const fragment = hashAt === -1 ? "" : pathname.slice(hashAt);
  const needsIndex = purePath.endsWith("/") || !path.posix.basename(purePath).includes(".");
  const target = needsIndex ? path.posix.join(purePath, "index.html") : purePath;
  return target + fragment;
}
// ----------------------------- HTML CHECK -----------------------------
/**
 * Decide whether a downloaded file should be treated as HTML.
 *
 * @param {string} filePath - Local path; its extension is checked.
 * @param {?string} contentType - Value of the Content-Type header, if any.
 * @param {?(Buffer|string)} firstBytes - Leading bytes of the body, if any.
 * @returns {boolean} True when the content type, the extension or the first
 *   512 bytes indicate an HTML document.
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  if (contentType && /text\/html/i.test(String(contentType))) return true;
  const ext = path.extname(filePath).toLowerCase();
  if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
  const head = (firstBytes || "").toString("utf8", 0, 512);
  // `\s` must be a single-escaped class: the former `[\\s>]` matched a literal
  // backslash or the letter "s" and therefore missed `<html lang="...">`.
  return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
// ----------------------------- Archive API -----------------------------
/**
 * Fetch one page of (timestamp, original-URL) rows from the Wayback CDX API.
 *
 * @param {object} opts
 * @param {string} opts.baseUrl - URL or wildcard pattern to query.
 * @param {?number} opts.pageIndex - Page number; omitted from the query when null/undefined.
 * @param {boolean} [opts.all] - When falsy, only status-200 captures are requested.
 * @param {number|string} [opts.fromTimestamp] - Lower bound; ignored when empty or 0.
 * @param {number|string} [opts.toTimestamp] - Upper bound; ignored when empty or 0.
 * @returns {Promise<Array>} Rows without the header row. Best effort: a network
 *   failure, a non-JSON body or a JSON body that is not an array yields [].
 */
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
  const cdx = new URL("https://web.archive.org/cdx/search/xd");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  params.set("collapse", "digest");
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
  if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  let json;
  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    json = JSON.parse(await res.text());
  } catch {
    // Deliberately silent: a broken or unparsable page counts as empty.
    return [];
  }
  // An error object or scalar must not reach callers, which rely on
  // `.length === 0` to stop paging and on `concat` to merge rows.
  if (!Array.isArray(json)) return [];
  if (Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
    json.shift();
  }
  return json;
}
// ----------------------------- DOWNLOADER CLASS -----------------------------
/**
 * Downloads the most recent archived copy of every path found for a base URL
 * in the Wayback Machine, optionally rewriting links and fetching page assets.
 */
class WaybackMachineDownloader {
  /**
   * @param {object} params
   * @param {string} params.base_url - Site URL to restore.
   * @param {boolean} [params.exact_url] - Skip the "/*" wildcard listing.
   * @param {?string} [params.directory] - Output directory; defaults to websites/<host>/.
   * @param {number|string} [params.from_timestamp] - Lower capture bound (0 = none).
   * @param {number|string} [params.to_timestamp] - Upper capture bound (0 = none).
   * @param {number} [params.threads_count=3] - Concurrent downloads.
   * @param {boolean} [params.download_external_assets] - Also fetch off-site assets.
   * @param {string} [params.rewrite_mode="as-is"] - "relative" rewrites internal links.
   * @param {string} [params.canonical_action="keep"] - "remove" drops canonical links.
   */
  constructor(params) {
    this.base_url = params.base_url;
    this.exact_url = !!params.exact_url;
    this.directory = params.directory || null;
    this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
    this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
    this.download_external_assets = params.download_external_assets || false;
    this.rewrite_mode = params.rewrite_mode || "as-is";
    this.rewrite_links = this.rewrite_mode === "relative";
    this.canonical_action = params.canonical_action || "keep";
    this._processed = 0;
  }

  // Create a human-readable backup folder name, preserving IDNs
  backup_name() {
    try {
      if (this.base_url.includes("//")) {
        const u = new URL(this.base_url);
        return domainToUnicode(u.host);
      }
    } catch {
      // Not parseable as a URL: fall back to the raw input below.
    }
    return this.base_url;
  }

  // Resolve output directory (always ends with the platform separator)
  backup_path() {
    if (this.directory) {
      return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
    }
    return path.join("websites", this.backup_name(), path.sep);
  }

  // Fetch and merge snapshot lists
  async get_all_snapshots_to_consider() {
    console.log("Getting snapshot pages");
    const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
    let list = [];
    list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
    process.stdout.write(".");
    if (!this.exact_url) {
      const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
      // Pages are requested one after another until an empty page is returned (capped at 100).
      for (let i = 0; i < 100; i++) {
        const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
        if (!batch || batch.length === 0) break;
        list = list.concat(batch);
        process.stdout.write(".");
      }
    }
    console.log(` found ${list.length} snapshots to consider.\n`);
    return list;
  }

  // Decode a URL path for use as a file id; a malformed percent-escape keeps
  // the raw path instead of dropping the entry.
  _decode_pathname(pathname) {
    try {
      return decodeURIComponent(pathname);
    } catch {
      return pathname;
    }
  }

  // Host name without a leading "www.", in Unicode form, for same-site checks.
  _bare_host(hostname) {
    return domainToUnicode(hostname.replace(/^www\./, ""));
  }

  // Choose the latest timestamp per unique pathname
  async get_file_list_by_timestamp() {
    const curated = new Map();
    const all = await this.get_all_snapshots_to_consider();
    for (const pair of all) {
      const ts = pair && pair[0];
      const url = pair && pair[1];
      if (!ts || !url) continue;
      let u;
      try {
        u = new URL(url);
      } catch {
        continue; // unparsable capture URL: nothing to download
      }
      const file_id = this._decode_pathname(u.pathname); // decode Cyrillic paths
      const prev = curated.get(file_id);
      if (!prev || prev.timestamp <= ts) {
        curated.set(file_id, { file_url: url, timestamp: ts, file_id });
      }
    }
    const arr = [...curated.values()];
    arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
    return arr;
  }

  // Replace Windows-hostile characters when running on Windows.
  // `platform` is overridable so the mapping can be exercised on any OS.
  _windowsSanitize(p, platform = process.platform) {
    if (platform !== "win32") return p;
    return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
  }

  // Ensure directory exists
  async _structure_dir_path(dir_path) {
    try {
      await mkdir(dir_path, { recursive: true });
    } catch (e) {
      if (!e || e.code !== "EEXIST") throw e;
    }
  }

  /**
   * Compute local file paths for a given archived URL.
   *
   * Segments are cleaned one by one before joining:
   *  - "." and ".." are dropped, because a decoded "%2F" can smuggle ".."
   *    segments into the id and move the target outside the backup directory;
   *  - the Windows sanitizer is applied per segment, since applying it to the
   *    joined path also rewrote the path separators and the drive colon.
   *
   * @returns {?{dir_path: string, file_path: string}} null when the URL is not downloadable.
   */
  _determine_paths(file_url, file_id) {
    if (!file_url || !file_id) return null;
    if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
    if (file_id.length > 200) return null;
    const backup = this.backup_path();
    const parts = file_id
      .split("/")
      .filter((part) => part && part !== "." && part !== "..")
      .map((part) => this._windowsSanitize(part));
    const lastPart = parts[parts.length - 1] || "";
    if (file_url.endsWith("/") || !lastPart.includes(".")) {
      // Directory-like URL: store it as <dir>/index.html.
      const dir_path = path.join(backup, ...parts);
      return { dir_path, file_path: path.join(dir_path, "index.html") };
    }
    return {
      dir_path: path.join(backup, ...parts.slice(0, -1)),
      file_path: path.join(backup, ...parts),
    };
  }

  // Stream a fetch body to disk. Errors from either stream reject the promise,
  // and the partial file is removed so a later run does not mistake it for a
  // finished download.
  _save_body(body, file_path) {
    return new Promise((resolve, reject) => {
      const source = Readable.fromWeb(body);
      const sink = fs.createWriteStream(file_path);
      const fail = (err) => {
        source.destroy();
        sink.destroy();
        fs.rm(file_path, { force: true }, () => reject(err));
      };
      source.on("error", fail);
      sink.on("error", fail);
      sink.on("finish", resolve);
      source.pipe(sink);
    });
  }

  // Download a single asset (img/css/js/etc.) referenced from an HTML page
  async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
    try {
      if (fs.existsSync(file_path)) return file_path;
      await this._structure_dir_path(dir_path);
      const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
        return null;
      }
      if (!res.ok || !res.body) {
        console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
        return null;
      }
      await this._save_body(res.body, file_path);
      return file_path;
    } catch (e) {
      console.log(`Asset download failed: ${assetUrl}: ${e}`);
      return null;
    }
  }

  // Parse saved HTML, optionally rewrite internal links to relative and fetch assets
  async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
    try {
      const backupRoot = this.backup_path();
      const html = fs.readFileSync(htmlPath, "utf8");
      const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
      const siteHost = this._bare_host(new URL(this.base_url).hostname);
      const baseDir = path.dirname(htmlPath);
      const downloadTasks = [];

      // Relative link to the local copy of `u`. The fragment is appended after
      // the path has been resolved so it never becomes part of a file name.
      const localHref = (u) => {
        const localTarget = ensureLocalTargetForPath(this._decode_pathname(u.pathname));
        return relativeLink(baseDir, path.join(backupRoot, localTarget)) + (u.hash || "");
      };

      // ----------- ASSETS -----------
      $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
        const attr = el.tagName === "link" ? "href" : "src";
        const val = $(el).attr(attr);
        if (!val) return;
        try {
          const u = new URL(val, pageUrl);
          const abs = u.toString();
          const isInternal = this._bare_host(u.hostname) === siteHost;
          if (!isInternal && !this.download_external_assets) return;
          let paths;
          try {
            paths = this._determine_paths(abs, this._decode_pathname(u.pathname));
          } catch (e) {
            console.log(`Invalid path for asset ${abs}: ${e}`);
            return;
          }
          if (!paths) return;
          const { dir_path, file_path } = paths;
          if (this.rewrite_links) {
            $(el).attr(attr, localHref(u));
          }
          if (!fs.existsSync(file_path)) {
            downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
          }
        } catch {
          // Unresolvable reference: leave the attribute untouched.
        }
      });

      // ----------- INTERNAL LINKS (pages/forms) -----------
      if (this.rewrite_links) {
        $("a[href], form[action]").each((_, el) => {
          const attr = el.tagName === "a" ? "href" : "action";
          const val = $(el).attr(attr);
          if (!val) return;
          try {
            const u = new URL(val, pageUrl);
            if (this._bare_host(u.hostname) === siteHost) {
              $(el).attr(attr, localHref(u));
            }
          } catch {
            // Unresolvable link: leave it untouched.
          }
        });
      }

      await Promise.all(downloadTasks);
      if (this.canonical_action === "remove") {
        $("link[rel=\"canonical\"]").remove();
      }
      fs.writeFileSync(htmlPath, $.html(), "utf8");
    } catch (e) {
      console.log(`HTML processing error: ${e}`);
    }
  }

  // Download one file from the snapshot list (page or asset saved by CDX).
  // Every exit path goes through `finally`, so each item advances the progress
  // counter exactly once.
  async _download_single(file_remote_info, total) {
    const file_url = String(file_remote_info.file_url);
    const file_id = file_remote_info.file_id;
    const file_timestamp = file_remote_info.timestamp;
    try {
      let paths;
      try {
        paths = this._determine_paths(file_url, file_id);
      } catch (e) {
        console.log(`Invalid path for ${file_url}: ${e}`);
        return;
      }
      if (!paths) {
        console.log(`Skipping invalid URL: ${file_url}`);
        return;
      }
      const { dir_path, file_path } = paths;
      if (fs.existsSync(file_path)) return;

      await this._structure_dir_path(dir_path);
      const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        console.log(`Skipping ${file_url}, fetch failed: ${e}`);
        return;
      }
      if (!res.ok || !res.body) {
        console.log(`Skipping ${file_url}, bad response ${res.status}`);
        return;
      }
      await this._save_body(res.body, file_path);
      const contentType = res.headers.get("content-type");
      const ext = path.extname(file_path).toLowerCase();
      const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
      if (looksHtml) {
        await this._process_html_assets(file_path, file_url, file_timestamp);
      }
    } catch (e) {
      console.log(`Download failed for ${file_url}: ${e}`);
    } finally {
      this._processed++;
      renderProgress(this._processed, total);
    }
  }

  // Orchestrate downloads with concurrency
  async download_files() {
    const startTime = Date.now();
    console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
    const list = await this.get_file_list_by_timestamp();
    if (list.length === 0) {
      console.log("No files to download.");
      return;
    }
    const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
    const limit = pLimit(concurrency);
    this._processed = 0;
    await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
    const endTime = Date.now();
    console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
  }
}
// ============================= INTERACTIVE RUN =============================
/**
 * Ask a single question on the readline interface.
 *
 * @param {{ question: Function }} rl - readline interface (or compatible object).
 * @param {string} question - Prompt text.
 * @returns {Promise<string>} The trimmed reply.
 */
function ask(rl, question) {
  return new Promise((resolve) => {
    const onAnswer = (answer) => resolve(answer.trim());
    rl.question(question, onAnswer);
  });
}
/**
 * Interactive entry point: collects the base URL and download options from
 * stdin, then runs a WaybackMachineDownloader with them.
 *
 * @returns {Promise<void>} Resolves once `download_files()` has finished.
 */
async function interactiveMain() {
  const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
  const saidYes = (reply) => /^y(es)?$/i.test(reply);

  // Re-prompt until the input is non-empty and parses as a URL.
  let base_url;
  let accepted = false;
  do {
    base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
    if (!base_url) continue;
    try {
      new URL(base_url);
      accepted = true;
    } catch {
      console.log("Please enter a valid URL.\n");
    }
  } while (!accepted);

  const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
  const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");

  const rewriteReply = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
  const rewrite_mode = saidYes(rewriteReply) ? "relative" : "as-is";

  // The canonical question is only asked when links are being rewritten.
  let canonical_action = "keep";
  if (rewrite_mode === "relative") {
    const canonicalReply = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
    if ((canonicalReply || "").toLowerCase() === "remove") {
      canonical_action = "remove";
    }
  }

  const threadsReply = await ask(rl, "How many download threads? (default 3): ");
  const parsedThreads = parseInt(threadsReply || "3", 10);
  const threads_count = Number.isFinite(parsedThreads) && parsedThreads > 0 ? parsedThreads : 3;

  const exactReply = await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ");
  const exact_url = saidYes(exactReply);
  const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
  const externalReply = await ask(rl, "Download external assets? (yes/no, default no): ");
  const download_external_assets = saidYes(externalReply);
  rl.close();

  const downloader = new WaybackMachineDownloader({
    base_url,
    exact_url,
    directory: directory || null,
    from_timestamp: from_timestamp || 0,
    to_timestamp: to_timestamp || 0,
    threads_count,
    rewrite_mode,
    canonical_action,
    download_external_assets,
  });
  await downloader.download_files();
}
/**
 * True when this module is the script Node was started with.
 *
 * `process.argv[1]` is absent when Node runs without a script (for example
 * `node -e`); `pathToFileURL(undefined)` would then throw while the module is
 * being imported, so that case is answered with `false` up front.
 */
const isDirectRun = (() => {
  const entryArg = process.argv[1];
  if (!entryArg) return false;
  return (
    import.meta.url === `file://${entryArg}` ||
    import.meta.url === pathToFileURL(entryArg).href
  );
})();
// Launch the interactive prompt only when this file is the process entry
// script; importing the module for its exports must not start prompting.
if (isDirectRun) {
interactiveMain().catch((err) => {
// Print the stack when the rejection value has one, then exit non-zero.
console.error(`FATAL: ${err?.stack || err}`);
process.exit(1);
});
}
export { WaybackMachineDownloader };

View File

@@ -0,0 +1,39 @@
/*
* Wayback Machine Downloader 0.3.0 by WhitelightSEO
* Run: node index.js
*/
import { pathToFileURL } from "url";
import { setDebugMode, getDebugMode, debugLog } from "./lib/logger.js";
import { WaybackMachineDownloader } from "./lib/downloader.js";
const DEBUG_MODE = false;
setDebugMode(DEBUG_MODE);
/**
 * True when this module is the script Node was started with: the entry
 * argument is compared with `import.meta.url`, first as a naive "file://"
 * string and then as a proper file URL.
 */
const isDirectRun = (() => {
  const [, entryArg = null] = process.argv || [];
  if (!entryArg) {
    return false;
  }
  const selfUrl = import.meta.url;
  let matches = selfUrl === `file://${entryArg}`;
  if (!matches) {
    try {
      matches = selfUrl === pathToFileURL(entryArg).href;
    } catch (e) {
      debugLog(`Failed to resolve entry script URL: ${e}`);
      matches = false;
    }
  }
  return matches;
})();
// The CLI module is loaded with a dynamic import inside this guard, so it is
// only evaluated when this file is the process entry script.
if (isDirectRun) {
import("./cli.js")
.then(({ interactiveMain }) => interactiveMain())
.catch((err) => {
// Covers both a failed import and a rejected interactiveMain(); exit non-zero.
console.error(`FATAL: ${err?.stack || err}`);
process.exit(1);
});
}
export { WaybackMachineDownloader, DEBUG_MODE, setDebugMode, getDebugMode };

View File

@@ -0,0 +1,392 @@
import fs from "fs";
import path from "path";
import { mkdir } from "fs/promises";
import { load } from "cheerio";
import { Readable } from "stream";
import { domainToUnicode } from "url";
import { debugLog } from "./logger.js";
import {
relativeLink,
ensureLocalTargetForPath,
isCssResource,
} from "./utils.js";
class AssetManager {
constructor({
backupPathResolver,
rewriteLinks,
canonicalAction,
downloadExternalAssets,
baseHostUnicode,
snapshotIndex,
}) {
this.backupPathResolver = backupPathResolver;
this.rewriteLinks = !!rewriteLinks;
this.canonicalAction = canonicalAction || "keep";
this.downloadExternalAssets = !!downloadExternalAssets;
this.baseHostUnicode = (baseHostUnicode || "").toLowerCase();
this.snapshotIndex = snapshotIndex || null;
}
setSnapshotIndex(index) {
this.snapshotIndex = index;
}
get backupPath() {
const resolver = this.backupPathResolver;
return typeof resolver === "function" ? resolver() : resolver;
}
windowsSanitize(p) {
if (process.platform !== "win32") return p;
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
}
async ensureDir(dirPath) {
try {
await mkdir(dirPath, { recursive: true });
} catch (e) {
if (!e || e.code !== "EEXIST") throw e;
}
}
determinePaths(fileUrl, fileId) {
if (!fileUrl || !fileId) return null;
if (fileUrl.startsWith("data:") || fileUrl.startsWith("javascript:")) return null;
if (fileId.length > 200) return null;
const backup = this.backupPath;
const parts = fileId.split("/").filter(Boolean);
let dirPath;
let filePath;
if (fileId === "") {
dirPath = backup;
filePath = path.join(backup, "index.html");
} else {
const lastPart = parts[parts.length - 1] || "";
if (fileUrl.endsWith("/") || !lastPart.includes(".")) {
dirPath = path.join(backup, ...parts);
filePath = path.join(dirPath, "index.html");
} else {
dirPath = path.join(backup, ...parts.slice(0, -1));
filePath = path.join(backup, ...parts);
}
}
dirPath = this.windowsSanitize(dirPath);
filePath = this.windowsSanitize(filePath);
return { dirPath, filePath };
}
resolveAssetTimestamp(assetUrl, fallbackTimestamp) {
if (!this.snapshotIndex) return fallbackTimestamp || 0;
return this.snapshotIndex.resolve(assetUrl, fallbackTimestamp);
}
async downloadAsset(assetUrl, pageTimestamp, filePath, dirPath) {
try {
if (fs.existsSync(filePath)) return filePath;
await this.ensureDir(dirPath);
const assetTimestamp = this.resolveAssetTimestamp(assetUrl, pageTimestamp);
if (!assetTimestamp) {
debugLog(`Skipping asset ${assetUrl}, no timestamp available in range.`);
return null;
}
const snapshotUrl = `https://web.archive.org/web/${assetTimestamp}id_/${assetUrl}`;
let res;
try {
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
} catch (e) {
debugLog(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
return null;
}
if (!res.ok || !res.body) {
debugLog(`Skipping asset ${assetUrl}, bad response ${res.status}`);
return null;
}
const contentType = res.headers.get("content-type") || "";
await new Promise((resolve, reject) => {
const ws = fs.createWriteStream(filePath);
Readable.fromWeb(res.body).pipe(ws);
ws.on("finish", resolve);
ws.on("error", reject);
});
if (this.rewriteLinks && isCssResource(filePath, assetUrl, contentType)) {
await this.rewriteCssFile(filePath, assetUrl, assetTimestamp);
}
return filePath;
} catch (e) {
debugLog(`Asset download failed: ${assetUrl}${e}`);
return null;
}
}
async rewriteCssContent(cssContent, cssSourceUrl, pageTimestamp, { baseDir, excludePath } = {}) {
if (!this.rewriteLinks) {
return { css: cssContent, downloads: [] };
}
if (!cssContent || !cssContent.trim()) {
return { css: cssContent, downloads: [] };
}
const siteHost = this.baseHostUnicode;
const downloads = [];
const seenPaths = new Set();
let updatedContent = cssContent;
let cssChanged = false;
const processReference = (rawValue) => {
if (!rawValue) return null;
const trimmed = rawValue.trim();
if (!trimmed) return null;
if (/^(data:|javascript:|#)/i.test(trimmed)) return null;
let absoluteUrl;
try {
absoluteUrl = new URL(trimmed, cssSourceUrl).toString();
} catch {
return null;
}
let parsed;
try {
parsed = new URL(absoluteUrl);
} catch {
return null;
}
if (!/^https?:$/i.test(parsed.protocol)) return null;
const normalizedHost = domainToUnicode(parsed.hostname.replace(/^www\./, "")).toLowerCase();
const isInternal = normalizedHost === siteHost;
if (!isInternal && !this.downloadExternalAssets) return null;
let fileId;
try {
fileId = decodeURIComponent(parsed.pathname);
} catch {
fileId = parsed.pathname;
}
let paths;
try {
paths = this.determinePaths(absoluteUrl, fileId);
} catch {
return null;
}
if (!paths) return null;
const { dirPath, filePath } = paths;
const assetTimestamp = this.resolveAssetTimestamp(absoluteUrl, pageTimestamp);
if (
filePath &&
(!excludePath || path.resolve(filePath) !== path.resolve(excludePath))
) {
const key = path.resolve(filePath);
if (!fs.existsSync(filePath) && !seenPaths.has(key)) {
seenPaths.add(key);
downloads.push(this.downloadAsset(absoluteUrl, assetTimestamp, filePath, dirPath));
}
}
const relativeBase = baseDir || path.dirname(filePath);
const relativePath = relativeLink(relativeBase, filePath) + (parsed.hash || "");
return {
original: trimmed,
replacement: relativePath,
};
};
const urlPattern = /url\(\s*(['"]?)([^'")]+)\1\s*\)/gi;
updatedContent = updatedContent.replace(urlPattern, (match, quote, value) => {
const info = processReference(value);
if (!info) return match;
if (info.replacement === info.original) return match;
cssChanged = true;
const q = quote || "";
return `url(${q}${info.replacement}${q})`;
});
const importPattern = /@import\s+(?!url\()\s*(['"])([^'"]+)\1/gi;
updatedContent = updatedContent.replace(importPattern, (match, quote, value) => {
const info = processReference(value);
if (!info) return match;
if (info.replacement === info.original) return match;
cssChanged = true;
return match.replace(value, info.replacement);
});
return {
css: cssChanged && updatedContent !== cssContent ? updatedContent : cssContent,
downloads,
};
}
async rewriteCssFile(cssPath, cssSourceUrl, pageTimestamp) {
if (!this.rewriteLinks) return;
let cssContent;
try {
cssContent = fs.readFileSync(cssPath, "utf8");
} catch {
return;
}
const cssDir = path.dirname(cssPath);
const { css: updatedContent, downloads } = await this.rewriteCssContent(
cssContent,
cssSourceUrl,
pageTimestamp,
{
baseDir: cssDir,
excludePath: cssPath,
}
);
if (downloads.length > 0) {
await Promise.all(downloads);
}
if (updatedContent !== cssContent) {
fs.writeFileSync(cssPath, updatedContent, "utf8");
}
}
async processHtml(htmlPath, pageUrl, pageTimestamp) {
try {
let html = fs.readFileSync(htmlPath, "utf8");
const $ = load(html, { decodeEntities: false });
const siteHost = this.baseHostUnicode;
const baseDir = path.dirname(htmlPath);
const backupRoot = this.backupPath;
const downloadTasks = [];
const handleCssFragment = async (cssText) => {
const { css: updatedCss, downloads } = await this.rewriteCssContent(
cssText,
pageUrl,
pageTimestamp,
{ baseDir }
);
if (downloads.length > 0) {
downloadTasks.push(...downloads);
}
return updatedCss;
};
$("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
const attr = el.tagName === "link" ? "href" : "src";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase();
const isInternal = normalizedHost === siteHost;
if (isInternal || this.downloadExternalAssets) {
let fileId;
try {
fileId = decodeURIComponent(u.pathname);
} catch {
fileId = u.pathname;
}
let paths;
try {
paths = this.determinePaths(abs, fileId);
} catch (e) {
console.log(`Invalid path for asset ${abs}: ${e}`);
return;
}
if (!paths) return;
const { dirPath, filePath } = paths;
if (this.rewriteLinks) {
const normPath = fileId + (u.hash || "");
const localTarget = ensureLocalTargetForPath(normPath);
const localAbsPath = path.join(backupRoot, localTarget);
$(el).attr(attr, relativeLink(baseDir, localAbsPath));
}
if (!fs.existsSync(filePath)) {
downloadTasks.push(
this.downloadAsset(abs, pageTimestamp, filePath, dirPath)
);
}
}
} catch {}
});
const styleNodes = $("style").toArray();
for (const node of styleNodes) {
const cssText = $(node).html();
if (!cssText) continue;
const updated = await handleCssFragment(cssText);
if (updated !== cssText) {
$(node).text(updated);
}
}
const inlineStyled = $("[style]").toArray();
for (const node of inlineStyled) {
const styleAttr = $(node).attr("style");
if (!styleAttr) continue;
const updated = await handleCssFragment(styleAttr);
if (updated !== styleAttr) {
$(node).attr("style", updated);
}
}
if (this.rewriteLinks) {
$("a[href], form[action]").each((_, el) => {
const attr = el.tagName === "a" ? "href" : "action";
const val = $(el).attr(attr);
if (!val) return;
try {
const abs = new URL(val, pageUrl).toString();
const u = new URL(abs);
const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase();
const isInternal = normalizedHost === siteHost;
if (isInternal) {
let normPath;
try {
normPath = decodeURIComponent(u.pathname);
} catch {
normPath = u.pathname;
}
normPath += u.hash || "";
const localTarget = ensureLocalTargetForPath(normPath);
const localAbsPath = path.join(backupRoot, localTarget);
$(el).attr(attr, relativeLink(baseDir, localAbsPath));
}
} catch {}
});
}
await Promise.all(downloadTasks);
if (this.canonicalAction === "remove") {
$("link[rel=\"canonical\"]").remove();
}
fs.writeFileSync(htmlPath, $.html(), "utf8");
} catch (e) {
console.log(`HTML processing error: ${e}`);
}
}
}
export { AssetManager };

View File

@@ -0,0 +1,222 @@
import fs from "fs";
import path from "path";
import { domainToUnicode } from "url";
import pLimit from "p-limit";
import { Readable } from "stream";
import { debugLog } from "./logger.js";
import { renderProgress, normalizeBaseUrlInput, isHtmlFile, isCssResource } from "./utils.js";
import { SnapshotIndex } from "./snapshot-index.js";
import { AssetManager } from "./asset-manager.js";
/**
 * Fetch one page of (timestamp, original-URL) rows from the Wayback CDX API.
 *
 * @param {object} opts
 * @param {string} opts.baseUrl - URL or wildcard pattern to query.
 * @param {?number} opts.pageIndex - Page number; omitted from the query when null/undefined.
 * @param {boolean} [opts.all] - When falsy, only status-200 captures are requested.
 * @param {number|string} [opts.fromTimestamp] - Lower bound; ignored when empty or 0.
 * @param {number|string} [opts.toTimestamp] - Upper bound; ignored when empty or 0.
 * @returns {Promise<Array>} Rows without the header row. Best effort: a network
 *   failure, a non-JSON body or a JSON body that is not an array yields [].
 */
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
  const cdx = new URL("https://web.archive.org/cdx/search/xd");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  params.set("collapse", "digest");
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
  if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  let text;
  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    text = await res.text();
  } catch (e) {
    // Still best effort, but leave a trace in debug mode instead of failing silently.
    debugLog(`CDX request failed for ${baseUrl}: ${e}`);
    return [];
  }

  let json;
  try {
    json = JSON.parse(text);
  } catch {
    return []; // a non-JSON body counts as an empty page
  }
  // An error object or scalar must not reach callers, which rely on
  // `.length === 0` to stop paging and on `concat` to merge rows.
  if (!Array.isArray(json)) return [];
  if (Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
    json.shift();
  }
  return json;
}
/**
 * Downloads the files listed by the Wayback Machine CDX API for a site and
 * stores them in a local backup directory, optionally rewriting links in the
 * saved HTML/CSS through the AssetManager.
 */
class WaybackMachineDownloader {
  /**
   * @param {object} params
   * @param {string} [params.base_url] - Domain or URL to archive; normalized with
   *   `normalizeBaseUrlInput` unless `normalized_base` is supplied.
   * @param {object} [params.normalized_base] - Pre-normalized base (same shape as
   *   the result of `normalizeBaseUrlInput`).
   * @param {boolean} [params.exact_url] - When truthy, only the base URL itself is
   *   queried (no `/*` wildcard listing).
   * @param {string} [params.directory] - Target directory; defaults to
   *   `websites/<host>/`.
   * @param {string|number} [params.from_timestamp] - Lower timestamp bound, 0/empty = none.
   * @param {string|number} [params.to_timestamp] - Upper timestamp bound, 0/empty = none.
   * @param {string|number} [params.threads_count=3] - Number of parallel downloads.
   * @param {boolean} [params.download_external_assets=false]
   * @param {string} [params.rewrite_mode="as-is"] - `"relative"` enables link rewriting.
   * @param {string} [params.canonical_action="keep"]
   */
  constructor(params) {
    const normalized = params.normalized_base || normalizeBaseUrlInput(params.base_url);
    this.base_url = normalized.canonicalUrl;
    this.base_variants = normalized.variants;
    this.base_host_unicode = (normalized.unicodeHost || normalized.bareHost).toLowerCase();
    this.exact_url = !!params.exact_url;
    this.directory = params.directory || null;
    this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
    this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
    this.download_external_assets = params.download_external_assets || false;
    this.rewrite_mode = params.rewrite_mode || "as-is";
    this.rewrite_links = this.rewrite_mode === "relative";
    this.canonical_action = params.canonical_action || "keep";
    this._processed = 0;
    this.snapshotIndex = null;
    this.assetManager = new AssetManager({
      // Resolved lazily so the asset manager always sees the current backup path.
      backupPathResolver: () => this.backup_path(),
      rewriteLinks: this.rewrite_links,
      canonicalAction: this.canonical_action,
      downloadExternalAssets: this.download_external_assets,
      baseHostUnicode: this.base_host_unicode,
      // The real index is injected later by get_file_list_by_timestamp().
      snapshotIndex: null,
    });
  }

  /**
   * Name of the backup folder: the Unicode form of the base URL's host, or the
   * raw base URL when it cannot be parsed.
   * @returns {string}
   */
  backup_name() {
    try {
      if (this.base_url.includes("//")) {
        const u = new URL(this.base_url);
        return domainToUnicode(u.host);
      }
    } catch {}
    return this.base_url;
  }

  /**
   * Directory (always ending with the platform path separator) the files are
   * written to.
   * @returns {string}
   */
  backup_path() {
    if (this.directory) {
      return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
    }
    return path.join("websites", this.backup_name(), path.sep);
  }

  /**
   * Queries the CDX API for every base variant (and, unless `exact_url` is set,
   * for up to 100 pages of the `/*` wildcard listing of each variant).
   * @returns {Promise<Array>} Concatenated raw rows returned by `getRawListFromApi`.
   */
  async get_all_snapshots_to_consider() {
    console.log("Getting snapshot pages");
    const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
    let list = [];
    const bases = this.base_variants && this.base_variants.length > 0 ? this.base_variants : [this.base_url];
    for (const base of bases) {
      list = list.concat(await getRawListFromApi({ baseUrl: base, pageIndex: null, ...httpOpts }));
      process.stdout.write(".");
      if (!this.exact_url) {
        const wildcard = base.endsWith("/*") ? base : base.replace(/\/*$/, "") + "/*";
        for (let i = 0; i < 100; i++) {
          const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
          // An empty page marks the end of the paginated listing.
          if (!batch || batch.length === 0) break;
          list = list.concat(batch);
          process.stdout.write(".");
        }
      }
    }
    console.log(` found ${list.length} snapshots to consider.\n`);
    return list;
  }

  /**
   * Builds a SnapshotIndex from all `[timestamp, url]` rows, shares it with the
   * asset manager and returns the index manifest.
   * @returns {Promise<Array>} Manifest entries as produced by `SnapshotIndex#getManifest`.
   */
  async get_file_list_by_timestamp() {
    const index = new SnapshotIndex();
    const all = await this.get_all_snapshots_to_consider();
    for (const pair of all) {
      const ts = pair && pair[0];
      const url = pair && pair[1];
      if (!ts || !url) continue;
      index.register(url, ts);
    }
    const manifest = index.getManifest();
    this.snapshotIndex = index;
    this.assetManager.setSnapshotIndex(index);
    return manifest;
  }

  /**
   * Streams a web `ReadableStream` into `filePath`.
   *
   * `readable.pipe()` does not forward errors of the source stream, so a network
   * failure in the middle of a transfer would otherwise raise an unhandled
   * `error` event and never settle the promise. The source error is therefore
   * routed through the sink, and a partially written file is removed so that a
   * later run does not mistake it for a completed download.
   *
   * @param {ReadableStream} body - Response body to save.
   * @param {string} filePath - Destination file.
   * @returns {Promise<void>} Rejects with the stream error when the transfer fails.
   */
  async _write_body_to_file(body, filePath) {
    try {
      await new Promise((resolve, reject) => {
        const source = Readable.fromWeb(body);
        const sink = fs.createWriteStream(filePath);
        // Destroying the sink with the error makes it emit `error` only after its
        // file descriptor has been closed, so the cleanup below is safe.
        source.on("error", (err) => sink.destroy(err));
        sink.on("error", (err) => {
          source.destroy();
          reject(err);
        });
        sink.on("finish", resolve);
        source.pipe(sink);
      });
    } catch (e) {
      try {
        fs.rmSync(filePath, { force: true });
      } catch {
        // Best effort: the original stream error is more useful to the caller.
      }
      throw e;
    }
  }

  /**
   * Downloads one manifest entry and, when link rewriting is enabled,
   * post-processes the saved CSS/HTML. Failures are logged and never thrown;
   * the progress counter is advanced exactly once per call.
   *
   * @param {{file_url: string, file_id: string, timestamp: string}} file_remote_info
   * @param {number} total - Total number of files, used for the progress bar.
   * @returns {Promise<void>}
   */
  async _download_single(file_remote_info, total) {
    const file_url = String(file_remote_info.file_url);
    const file_id = file_remote_info.file_id;
    const file_timestamp = file_remote_info.timestamp;
    let paths;
    try {
      paths = this.assetManager.determinePaths(file_url, file_id);
    } catch (e) {
      console.log(`Invalid path for ${file_url}: ${e}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }
    if (!paths) {
      console.log(`Skipping invalid URL: ${file_url}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }
    const { dirPath, filePath } = paths;
    // Files already on disk are treated as downloaded and are not fetched again.
    if (fs.existsSync(filePath)) {
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }
    try {
      await this.assetManager.ensureDir(dirPath);
      // NOTE(review): the `id_` suffix presumably requests the raw capture - confirm.
      const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        debugLog(`Skipping ${file_url}, fetch failed: ${e}`);
        return;
      }
      if (!res.ok || !res.body) {
        debugLog(`Skipping ${file_url}, bad response ${res.status}`);
        return;
      }
      await this._write_body_to_file(res.body, filePath);
      const contentType = res.headers.get("content-type") || "";
      const ext = path.extname(filePath).toLowerCase();
      const looksHtml = isHtmlFile(filePath, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
      if (this.rewrite_links && isCssResource(filePath, file_url, contentType)) {
        await this.assetManager.rewriteCssFile(filePath, file_url, file_timestamp);
      }
      if (this.rewrite_links && looksHtml) {
        await this.assetManager.processHtml(filePath, file_url, file_timestamp);
      }
    } catch (e) {
      debugLog(`Download failed for ${file_url}: ${e}`);
    } finally {
      // Runs for the early `return`s inside the try block as well.
      this._processed++;
      renderProgress(this._processed, total);
    }
  }

  /**
   * Entry point: lists the snapshots and downloads them with at most
   * `threads_count` requests in flight.
   * @returns {Promise<void>}
   */
  async download_files() {
    const startTime = Date.now();
    console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
    const list = await this.get_file_list_by_timestamp();
    if (list.length === 0) {
      console.log("No files to download.");
      return;
    }
    // Fractional values are floored because the limiter is given a whole number;
    // anything that is not a positive count falls back to a single worker.
    const requested = Number.isFinite(this.threads_count) ? Math.floor(this.threads_count) : this.threads_count;
    const concurrency = requested && requested > 0 ? requested : 1;
    const limit = pLimit(concurrency);
    this._processed = 0;
    await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
    const endTime = Date.now();
    console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
  }
}
export { WaybackMachineDownloader };

View File

@@ -0,0 +1,21 @@
// Module-wide switch that controls whether debugLog() prints anything.
let debugMode = false;

/**
 * Turns debug output on or off.
 * @param {*} value - Any value; it is coerced to a boolean.
 */
function setDebugMode(value) {
  debugMode = Boolean(value);
}

/**
 * @returns {boolean} Whether debug output is currently enabled.
 */
function getDebugMode() {
  return debugMode;
}

/**
 * Forwards its arguments to console.log, but only while debug mode is enabled.
 * @param {...*} args - Values to print.
 */
function debugLog(...args) {
  if (!debugMode) return;
  console.log(...args);
}
/**
 * Prints its arguments with console.log unconditionally.
 * @param {...*} messages - Values to print.
 */
function infoLog(...messages) {
  console.log(...messages);
}
export { setDebugMode, getDebugMode, debugLog, infoLog };

View File

@@ -0,0 +1,138 @@
/**
 * Keeps, for every URL path (and for every path + query string), the entry
 * with the greatest timestamp that was registered, and derives lookup tables
 * from those entries on demand.
 */
class SnapshotIndex {
  constructor() {
    this.byPath = new Map();
    this.byPathAndQuery = new Map();
    // Derived data; rebuilt lazily by buildCaches() after each register().
    this.lookupByPath = null;
    this.lookupByPathAndQuery = null;
    this.manifestCache = null;
  }

  /**
   * Percent-decodes a URL pathname, falling back to the raw value when it
   * contains malformed escape sequences.
   * @param {string} pathname
   * @returns {string}
   */
  static #decodePath(pathname) {
    try {
      return decodeURIComponent(pathname);
    } catch {
      return pathname;
    }
  }

  /**
   * Records a snapshot. Unparsable URLs and falsy arguments are ignored.
   * An entry replaces a stored one when its timestamp (compared as a string)
   * is greater than or equal to the stored timestamp.
   * @param {string} url - Archived URL.
   * @param {string|number} timestamp - Snapshot timestamp.
   */
  register(url, timestamp) {
    if (!url || !timestamp) return;

    let location;
    try {
      location = new URL(url);
    } catch {
      return;
    }

    const filePath = SnapshotIndex.#decodePath(location.pathname);
    const queryKey = `${filePath}${location.search || ""}`;
    const normalizedTimestamp = String(timestamp);

    const supersedes = (stored) => !stored || String(stored.timestamp) <= normalizedTimestamp;
    const createEntry = () => ({
      file_url: url,
      timestamp: normalizedTimestamp,
      file_id: filePath,
    });

    if (supersedes(this.byPath.get(filePath))) {
      this.byPath.set(filePath, createEntry());
    }
    if (supersedes(this.byPathAndQuery.get(queryKey))) {
      this.byPathAndQuery.set(queryKey, createEntry());
    }

    // Anything derived from the maps is stale now.
    this.lookupByPath = null;
    this.lookupByPathAndQuery = null;
    this.manifestCache = null;
  }

  /**
   * Builds the manifest (one entry per path, newest timestamp first) and the
   * two timestamp lookup tables, unless they are already cached.
   */
  buildCaches() {
    if (this.manifestCache) return;

    const manifest = Array.from(this.byPath, ([file_id, value]) => ({ ...value, file_id }));
    manifest.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));

    const pathLookup = new Map();
    const queryLookup = new Map();

    for (const { file_url, file_id, timestamp } of manifest) {
      if (file_id && timestamp && !pathLookup.has(file_id)) {
        pathLookup.set(file_id, timestamp);
      }
      if (!file_url) continue;

      let location;
      try {
        location = new URL(file_url);
      } catch {
        continue;
      }
      const pathKey = `${SnapshotIndex.#decodePath(location.pathname)}${location.search || ""}`;
      if (pathKey && timestamp && !queryLookup.has(pathKey)) {
        queryLookup.set(pathKey, timestamp);
      }
    }

    // Add the query-specific entries that did not win the per-path slot.
    for (const [queryKey, entry] of this.byPathAndQuery.entries()) {
      const ts = entry && entry.timestamp;
      if (!queryKey || !ts) continue;
      if (!queryLookup.has(queryKey)) {
        queryLookup.set(queryKey, ts);
      }
      const basePath = queryKey.replace(/\?.*$/, "");
      if (basePath && !pathLookup.has(basePath)) {
        pathLookup.set(basePath, ts);
      }
    }

    this.manifestCache = manifest;
    this.lookupByPath = pathLookup;
    this.lookupByPathAndQuery = queryLookup;
  }

  /**
   * @returns {Array<{file_url: string, timestamp: string, file_id: string}>}
   *   One entry per registered path, sorted by timestamp in descending order.
   */
  getManifest() {
    this.buildCaches();
    return this.manifestCache || [];
  }

  /**
   * Finds the timestamp to use for an asset URL: an exact path + query match
   * is preferred, then a path-only match, then the fallback.
   * @param {string} assetUrl - Absolute URL of the asset.
   * @param {string|number} [fallbackTimestamp] - Returned when nothing matches.
   * @returns {string|number} Matching timestamp, the fallback, or 0.
   */
  resolve(assetUrl, fallbackTimestamp) {
    this.buildCaches();
    const fallback = fallbackTimestamp || 0;
    if (!assetUrl) return fallback;

    let location;
    try {
      location = new URL(assetUrl);
    } catch {
      return fallback;
    }

    const decodedPath = SnapshotIndex.#decodePath(location.pathname);
    const queryKey = `${decodedPath}${location.search || ""}`;
    if (this.lookupByPathAndQuery && this.lookupByPathAndQuery.has(queryKey)) {
      return this.lookupByPathAndQuery.get(queryKey);
    }
    if (this.lookupByPath && this.lookupByPath.has(decodedPath)) {
      return this.lookupByPath.get(decodedPath);
    }
    return fallback;
  }
}
export { SnapshotIndex };

View File

@@ -0,0 +1,117 @@
import path from "path";
import { domainToUnicode } from "url";
/**
 * Draws a 40-character progress bar on stdout, returning the cursor to the
 * start of the line first so that successive calls overwrite each other.
 * A newline is written once `current` equals `total`.
 *
 * @param {number} current - Number of items processed so far.
 * @param {number} total - Total number of items; a non-positive total renders an empty bar.
 */
function renderProgress(current, total) {
  const width = 40;
  const rawRatio = total > 0 ? current / total : 0;
  // Clamp to [0, 1]: String.prototype.repeat throws a RangeError for negative
  // counts, which an out-of-range `current` would otherwise produce.
  const ratio = Math.min(1, Math.max(0, rawRatio));
  const filled = Math.round(ratio * width);
  const bar = "█".repeat(filled) + "-".repeat(width - filled);
  process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
  if (current === total) process.stdout.write("\n");
}
/**
 * Replaces every platform path separator in `p` with a forward slash.
 * @param {string} p - Path using the separators of the current platform.
 * @returns {string} The same path written with "/" separators.
 */
function toPosix(p) {
  return p.replaceAll(path.sep, "/");
}
/**
 * Computes the forward-slash link that leads from a directory to a file.
 * When `path.relative` yields an empty string, the file's base name is used
 * instead.
 * @param {string} fromDir - Directory the link starts from.
 * @param {string} toFile - File the link points to.
 * @returns {string} Relative link using "/" separators.
 */
function relativeLink(fromDir, toFile) {
  const relativePath = path.relative(fromDir, toFile);
  if (relativePath) {
    return toPosix(relativePath);
  }
  return toPosix(path.basename(toFile));
}
/**
 * Maps a URL pathname to the file it should be stored as: pathnames that end
 * with "/" or whose last segment contains no "." get "index.html" appended,
 * everything else is returned unchanged.
 * @param {string} pathname - URL pathname.
 * @returns {string} Pathname of the local target file.
 */
function ensureLocalTargetForPath(pathname) {
  const endsWithSlash = pathname.endsWith("/");
  if (endsWithSlash || !path.posix.basename(pathname).includes(".")) {
    return path.posix.join(pathname, "index.html");
  }
  return pathname;
}
/**
 * Normalizes a user-supplied domain or URL.
 *
 * Input without a scheme is treated as `https://<input>`. This includes the
 * `host:port` form (for example `example.com:8080/blog`), which the URL parser
 * would otherwise read as a URL with the scheme `example.com:`.
 * Query string and fragment are discarded; trailing slashes of the path are removed.
 *
 * @param {string} input - Domain or http(s) URL.
 * @returns {{canonicalUrl: string, variants: string[], bareHost: string, unicodeHost: string}}
 *   `canonicalUrl` is the https URL of the host without a leading "www.";
 *   `variants` lists the https/http URLs for the bare host and its "www." counterpart
 *   (or the originally entered "www." host); `bareHost` is the ASCII host without "www.";
 *   `unicodeHost` is `bareHost` passed through `domainToUnicode`.
 * @throws {Error} When the input is empty, not a string, not parsable as a URL,
 *   uses a protocol other than http/https, or has no hostname.
 */
function normalizeBaseUrlInput(input) {
  if (!input || typeof input !== "string") {
    throw new Error("Base URL must be a non-empty string");
  }
  let raw = input.trim();
  if (!raw) {
    throw new Error("Base URL must not be empty");
  }
  const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(raw);
  // "name:digits" followed by the end of the input or a path/query/fragment is
  // a host with a port, not a scheme, even though it matches the scheme grammar.
  const looksLikeHostPort = /^[^\s/:@?#]+:\d+(?:[/?#]|$)/.test(raw);
  if (!hasScheme || looksLikeHostPort) {
    raw = `https://${raw}`;
  }
  let parsed;
  try {
    parsed = new URL(raw);
  } catch (e) {
    throw new Error(`Invalid URL: ${e.message}`);
  }
  if (!/^https?:$/i.test(parsed.protocol)) {
    throw new Error("Only http and https protocols are supported");
  }
  const asciiHost = parsed.hostname.toLowerCase();
  if (!asciiHost) {
    throw new Error("URL must contain a hostname");
  }
  const bareHost = asciiHost.replace(/^www\./, "");
  const unicodeHost = domainToUnicode(bareHost);
  const port = parsed.port ? `:${parsed.port}` : "";
  const basePath = parsed.pathname && parsed.pathname !== "/" ? parsed.pathname.replace(/\/+$/, "") : "";
  const canonicalUrl = `https://${bareHost}${port}${basePath}`;
  // The bare host always comes first; its "www." sibling is added either because
  // the user typed it or because the host is dotted (so "localhost" gets none).
  const hostSet = new Set([`${bareHost}${port}`]);
  if (asciiHost !== bareHost) {
    hostSet.add(`${asciiHost}${port}`);
  } else if (bareHost && bareHost.includes(".")) {
    hostSet.add(`www.${bareHost}${port}`);
  }
  const protocols = ["https:", "http:"];
  const variants = new Set();
  for (const protocol of protocols) {
    for (const host of hostSet) {
      variants.add(`${protocol}//${host}${basePath}`);
    }
  }
  return {
    canonicalUrl,
    variants: Array.from(variants),
    bareHost,
    unicodeHost,
  };
}
/**
 * Decides whether a downloaded file should be treated as HTML. The checks are
 * applied in order: a `text/html` content type, a known HTML-producing file
 * extension, and finally an HTML doctype or `<html>` tag within the first
 * 512 bytes of `firstBytes`.
 * @param {string} filePath - Local file path (only its extension is inspected).
 * @param {string|null} contentType - Value of the Content-Type header, if any.
 * @param {Buffer|string|null} firstBytes - Beginning of the file content, if available.
 * @returns {boolean}
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  if (contentType && /text\/html/i.test(String(contentType))) {
    return true;
  }

  const htmlExtensions = [".html", ".htm", ".php", ".asp", ".aspx"];
  if (htmlExtensions.includes(path.extname(filePath).toLowerCase())) {
    return true;
  }

  const head = (firstBytes || "").toString("utf8", 0, 512);
  if (/<!doctype html/i.test(head)) {
    return true;
  }
  return /<html[\s>]/i.test(head);
}
/**
 * Decides whether a resource is a stylesheet, based on (in order) the local
 * file extension, a `text/css` content type, or a `.css` ending of the
 * resource URL's pathname.
 * @param {string|null} filePath - Local file path.
 * @param {string|null} resourceUrl - Absolute URL the resource was fetched from.
 * @param {string|null} contentType - Value of the Content-Type header, if any.
 * @returns {boolean}
 */
function isCssResource(filePath, resourceUrl, contentType) {
  if (path.extname(filePath || "").toLowerCase() === ".css") {
    return true;
  }
  if (contentType && /text\/css/i.test(String(contentType))) {
    return true;
  }
  if (!resourceUrl) {
    return false;
  }
  try {
    return /\.css(?:$|\?)/i.test(new URL(resourceUrl).pathname);
  } catch {
    // An unparsable URL simply provides no evidence.
    return false;
  }
}
export {
renderProgress,
toPosix,
relativeLink,
ensureLocalTargetForPath,
normalizeBaseUrlInput,
isHtmlFile,
isCssResource,
};

View File

@@ -1,14 +1,22 @@
{ {
"name": "wayback-downloader", "name": "wayback-machine-downloader",
"version": "0.2.1", "version": "0.2.1",
"description": "Interactive Wayback Machine downloader for archiving websites locally.", "description": "Interactive Wayback Machine downloader for archiving websites locally.",
"type": "module", "type": "module",
"main": "downloader.js", "main": "./index.js",
"exports": {
".": "./index.js",
"./downloader": "./lib/downloader.js",
"./downloader.js": "./lib/downloader.js",
"./cli": "./cli.js",
"./package.json": "./package.json"
},
"bin": { "bin": {
"wayback-downloader": "downloader.js" "wayback-machine-downloader": "./cli.js"
}, },
"scripts": { "scripts": {
"start": "node downloader.js" "start": "node cli.js",
"download": "node cli.js"
}, },
"dependencies": { "dependencies": {
"cheerio": "^1.0.0-rc.12", "cheerio": "^1.0.0-rc.12",
@@ -17,19 +25,25 @@
"engines": { "engines": {
"node": ">=18" "node": ">=18"
}, },
"files": [
"cli.js",
"index.js",
"lib"
],
"keywords": [ "keywords": [
"wayback-machine-downloader", "wayback",
"web-archive-downloder", "archive",
"archiver" "downloader",
"wayback-machine"
], ],
"author": "birbwatcher", "author": "birbwatcher",
"license": "MIT", "license": "MIT",
"repository": { "repository": {
"type": "git", "type": "git",
"url": "https://github.com/birbwatcher/wayback-downloader.git" "url": "https://github.com/birbwatcher/wayback-machine-downloader.git"
}, },
"bugs": { "bugs": {
"url": "https://github.com/birbwatcher/wayback-downloader/issues" "url": "https://github.com/birbwatcher/wayback-machine-downloader/issues"
}, },
"homepage": "https://github.com/birbwatcher/wayback-downloader#readme" "homepage": "https://github.com/birbwatcher/wayback-machine-downloader#readme"
} }