mirror of
https://github.com/birbwatcher/wayback-machine-downloader.git
synced 2026-01-29 01:40:41 +00:00
fix: prevent crashes on invalid paths and silent CDX JSON parse errors
This commit is contained in:
@@ -50,8 +50,12 @@ This webarchive website downloader has an interactive interface, supports downlo
|
||||
|
||||
```bash
|
||||
git clone https://github.com/birbwatcher/wayback-machine-downloader.git
|
||||
```
|
||||
Go to the inner folder "wayback-machine-downloader":
|
||||
```bash
|
||||
cd wayback-machine-downloader
|
||||
|
||||
```
|
||||
```bash
|
||||
# Install dependencies
|
||||
npm install
|
||||
```
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
|
||||
* Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
|
||||
* Run: node downloader.js
|
||||
*/
|
||||
|
||||
@@ -21,9 +21,7 @@ function renderProgress(current, total) {
|
||||
const ratio = total > 0 ? current / total : 0;
|
||||
const filled = Math.round(ratio * width);
|
||||
const bar = "█".repeat(filled) + "-".repeat(width - filled);
|
||||
process.stdout.write(
|
||||
`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`
|
||||
);
|
||||
process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
|
||||
if (current === total) process.stdout.write("\n");
|
||||
}
|
||||
|
||||
@@ -47,17 +45,12 @@ function isHtmlFile(filePath, contentType, firstBytes) {
|
||||
const ext = path.extname(filePath).toLowerCase();
|
||||
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
|
||||
const head = (firstBytes || "").toString("utf8", 0, 512);
|
||||
return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
|
||||
return /<!doctype html/i.test(head) || /<html[\\s>]/i.test(head);
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------- Archive API -----------------------------
|
||||
async function getRawListFromApi({
|
||||
baseUrl,
|
||||
pageIndex,
|
||||
all,
|
||||
fromTimestamp,
|
||||
toTimestamp,
|
||||
}) {
|
||||
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
|
||||
const cdx = new URL("https://web.archive.org/cdx/search/xd");
|
||||
const params = new URLSearchParams();
|
||||
params.set("output", "json");
|
||||
@@ -66,43 +59,41 @@ async function getRawListFromApi({
|
||||
params.set("collapse", "digest");
|
||||
params.set("gzip", "false");
|
||||
if (!all) params.append("filter", "statuscode:200");
|
||||
if (fromTimestamp && Number(fromTimestamp) !== 0)
|
||||
params.set("from", String(fromTimestamp));
|
||||
if (toTimestamp && Number(toTimestamp) !== 0)
|
||||
params.set("to", String(toTimestamp));
|
||||
if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
|
||||
if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
|
||||
if (pageIndex != null) params.set("page", String(pageIndex));
|
||||
cdx.search = params.toString();
|
||||
|
||||
try {
|
||||
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
|
||||
const text = await res.text();
|
||||
const json = JSON.parse(text);
|
||||
if (
|
||||
Array.isArray(json) &&
|
||||
Array.isArray(json[0]) &&
|
||||
json[0].join(",") === "timestamp,original"
|
||||
) {
|
||||
let json = [];
|
||||
try {
|
||||
json = JSON.parse(text);
|
||||
} catch {
|
||||
// silent: treat as empty page
|
||||
return [];
|
||||
}
|
||||
if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
|
||||
json.shift();
|
||||
}
|
||||
return json || [];
|
||||
} catch (e) {
|
||||
console.log(`ERROR getRawListFromApi: ${e}`);
|
||||
} catch {
|
||||
// silent: skip broken page
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------- DOWNLOADER CLASS -----------------------------
|
||||
class WaybackMachineDownloader {
|
||||
constructor(params) {
|
||||
this.base_url = params.base_url;
|
||||
this.exact_url = !!params.exact_url;
|
||||
this.directory = params.directory || null;
|
||||
this.from_timestamp = params.from_timestamp
|
||||
? Number(params.from_timestamp)
|
||||
: 0;
|
||||
this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
|
||||
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
|
||||
this.threads_count =
|
||||
params.threads_count != null ? Number(params.threads_count) : 3;
|
||||
this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
|
||||
|
||||
this.download_external_assets = params.download_external_assets || false;
|
||||
|
||||
@@ -113,49 +104,38 @@ class WaybackMachineDownloader {
|
||||
this._processed = 0;
|
||||
}
|
||||
|
||||
// Create a human-readable backup folder name, preserving IDNs
|
||||
backup_name() {
|
||||
try {
|
||||
if (this.base_url.includes("//")) {
|
||||
const u = new URL(this.base_url);
|
||||
return domainToUnicode(u.host); // use human-readable domain
|
||||
return domainToUnicode(u.host);
|
||||
}
|
||||
} catch {}
|
||||
return this.base_url;
|
||||
}
|
||||
|
||||
// Resolve output directory
|
||||
backup_path() {
|
||||
if (this.directory) {
|
||||
return this.directory.endsWith(path.sep)
|
||||
? this.directory
|
||||
: this.directory + path.sep;
|
||||
return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
|
||||
}
|
||||
return path.join("websites", this.backup_name(), path.sep);
|
||||
}
|
||||
|
||||
// Fetch and merge snapshot lists
|
||||
async get_all_snapshots_to_consider() {
|
||||
console.log("Getting snapshot pages");
|
||||
const httpOpts = {
|
||||
all: true,
|
||||
fromTimestamp: this.from_timestamp,
|
||||
toTimestamp: this.to_timestamp,
|
||||
};
|
||||
const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
|
||||
let list = [];
|
||||
|
||||
list = list.concat(
|
||||
await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })
|
||||
);
|
||||
list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
|
||||
process.stdout.write(".");
|
||||
|
||||
if (!this.exact_url) {
|
||||
const wildcard = this.base_url.endsWith("/*")
|
||||
? this.base_url
|
||||
: this.base_url.replace(/\/*$/, "") + "/*";
|
||||
const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
|
||||
for (let i = 0; i < 100; i++) {
|
||||
const batch = await getRawListFromApi({
|
||||
baseUrl: wildcard,
|
||||
pageIndex: i,
|
||||
...httpOpts,
|
||||
});
|
||||
const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
|
||||
if (!batch || batch.length === 0) break;
|
||||
list = list.concat(batch);
|
||||
process.stdout.write(".");
|
||||
@@ -165,12 +145,15 @@ class WaybackMachineDownloader {
|
||||
return list;
|
||||
}
|
||||
|
||||
|
||||
// Choose the latest timestamp per unique pathname
|
||||
async get_file_list_by_timestamp() {
|
||||
const curated = new Map();
|
||||
const all = await this.get_all_snapshots_to_consider();
|
||||
for (const pair of all) {
|
||||
const ts = pair[0];
|
||||
const url = pair[1];
|
||||
const ts = pair && pair[0];
|
||||
const url = pair && pair[1];
|
||||
if (!ts || !url) continue;
|
||||
try {
|
||||
const u = new URL(url);
|
||||
const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
|
||||
@@ -185,13 +168,13 @@ class WaybackMachineDownloader {
|
||||
return arr;
|
||||
}
|
||||
|
||||
// Replace Windows-hostile characters when running on Windows
|
||||
_windowsSanitize(p) {
|
||||
if (process.platform !== "win32") return p;
|
||||
return p.replace(/[:*?&=<>\\|]/g, (s) =>
|
||||
"%" + s.charCodeAt(0).toString(16)
|
||||
);
|
||||
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
|
||||
}
|
||||
|
||||
// Ensure directory exists
|
||||
async _structure_dir_path(dir_path) {
|
||||
try {
|
||||
await mkdir(dir_path, { recursive: true });
|
||||
@@ -200,9 +183,10 @@ class WaybackMachineDownloader {
|
||||
}
|
||||
}
|
||||
|
||||
// Compute local file paths for a given archived URL
|
||||
_determine_paths(file_url, file_id) {
|
||||
if (file_url.startsWith("data:") || file_url.startsWith("javascript:"))
|
||||
return null;
|
||||
if (!file_url || !file_id) return null;
|
||||
if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
|
||||
if (file_id.length > 200) return null;
|
||||
|
||||
const backup = this.backup_path();
|
||||
@@ -212,15 +196,15 @@ class WaybackMachineDownloader {
|
||||
if (file_id === "") {
|
||||
dir_path = backup;
|
||||
file_path = path.join(backup, "index.html");
|
||||
} else if (
|
||||
file_url.endsWith("/") ||
|
||||
!parts[parts.length - 1].includes(".")
|
||||
) {
|
||||
dir_path = path.join(backup, ...parts);
|
||||
file_path = path.join(dir_path, "index.html");
|
||||
} else {
|
||||
dir_path = path.join(backup, ...parts.slice(0, -1));
|
||||
file_path = path.join(backup, ...parts);
|
||||
const lastPart = parts[parts.length - 1] || "";
|
||||
if (file_url.endsWith("/") || !lastPart.includes(".")) {
|
||||
dir_path = path.join(backup, ...parts);
|
||||
file_path = path.join(dir_path, "index.html");
|
||||
} else {
|
||||
dir_path = path.join(backup, ...parts.slice(0, -1));
|
||||
file_path = path.join(backup, ...parts);
|
||||
}
|
||||
}
|
||||
|
||||
dir_path = this._windowsSanitize(dir_path);
|
||||
@@ -229,6 +213,8 @@ class WaybackMachineDownloader {
|
||||
return { dir_path, file_path };
|
||||
}
|
||||
|
||||
|
||||
// Download a single asset (img/css/js/etc.) referenced from an HTML page
|
||||
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
|
||||
try {
|
||||
if (fs.existsSync(file_path)) return file_path;
|
||||
@@ -261,21 +247,20 @@ class WaybackMachineDownloader {
|
||||
}
|
||||
}
|
||||
|
||||
// Parse saved HTML, optionally rewrite internal links to relative and fetch assets
|
||||
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
|
||||
try {
|
||||
const backupRoot = this.backup_path();
|
||||
let html = fs.readFileSync(htmlPath, "utf8");
|
||||
const $ = load(html);
|
||||
const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
|
||||
const site = new URL(this.base_url);
|
||||
const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
|
||||
const siteHost = domainToUnicode(site.hostname.replace(/^www\\./, ""));
|
||||
const baseDir = path.dirname(htmlPath);
|
||||
|
||||
const downloadTasks = [];
|
||||
|
||||
// ----------- ASSETS -----------
|
||||
$(
|
||||
"img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]"
|
||||
).each((_, el) => {
|
||||
$("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
|
||||
const attr = el.tagName === "link" ? "href" : "src";
|
||||
const val = $(el).attr(attr);
|
||||
if (!val) return;
|
||||
@@ -283,12 +268,17 @@ class WaybackMachineDownloader {
|
||||
try {
|
||||
const abs = new URL(val, pageUrl).toString();
|
||||
const u = new URL(abs);
|
||||
const isInternal =
|
||||
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
|
||||
const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost;
|
||||
|
||||
if (isInternal || this.download_external_assets) {
|
||||
const file_id = decodeURIComponent(u.pathname);
|
||||
const paths = this._determine_paths(abs, file_id);
|
||||
let paths;
|
||||
try {
|
||||
paths = this._determine_paths(abs, file_id);
|
||||
} catch (e) {
|
||||
console.log(`Invalid path for asset ${abs}: ${e}`);
|
||||
return;
|
||||
}
|
||||
if (!paths) return;
|
||||
const { dir_path, file_path } = paths;
|
||||
|
||||
@@ -300,9 +290,7 @@ class WaybackMachineDownloader {
|
||||
}
|
||||
|
||||
if (!fs.existsSync(file_path)) {
|
||||
downloadTasks.push(
|
||||
this._download_asset(abs, pageTimestamp, file_path, dir_path)
|
||||
);
|
||||
downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
|
||||
}
|
||||
}
|
||||
} catch {}
|
||||
@@ -318,8 +306,7 @@ class WaybackMachineDownloader {
|
||||
try {
|
||||
const abs = new URL(val, pageUrl).toString();
|
||||
const u = new URL(abs);
|
||||
const isInternal =
|
||||
domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
|
||||
const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost;
|
||||
|
||||
if (isInternal) {
|
||||
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
|
||||
@@ -343,17 +330,30 @@ class WaybackMachineDownloader {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Download one file from the snapshot list (page or asset saved by CDX)
|
||||
async _download_single(file_remote_info, total) {
|
||||
const file_url = String(file_remote_info.file_url);
|
||||
const file_id = file_remote_info.file_id;
|
||||
const file_timestamp = file_remote_info.timestamp;
|
||||
const paths = this._determine_paths(file_url, file_id);
|
||||
|
||||
let paths;
|
||||
try {
|
||||
paths = this._determine_paths(file_url, file_id);
|
||||
} catch (e) {
|
||||
console.log(`Invalid path for ${file_url}: ${e}`);
|
||||
this._processed++;
|
||||
renderProgress(this._processed, total);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!paths) {
|
||||
console.log(`Skipping invalid URL: ${file_url}`);
|
||||
this._processed++;
|
||||
renderProgress(this._processed, total);
|
||||
return;
|
||||
}
|
||||
|
||||
const { dir_path, file_path } = paths;
|
||||
|
||||
if (fs.existsSync(file_path)) {
|
||||
@@ -387,11 +387,7 @@ class WaybackMachineDownloader {
|
||||
|
||||
const contentType = res.headers.get("content-type");
|
||||
const ext = path.extname(file_path).toLowerCase();
|
||||
const looksHtml =
|
||||
isHtmlFile(file_path, contentType, null) ||
|
||||
ext === "" ||
|
||||
ext === ".html" ||
|
||||
ext === ".htm";
|
||||
const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
|
||||
if (looksHtml) {
|
||||
await this._process_html_assets(file_path, file_url, file_timestamp);
|
||||
}
|
||||
@@ -403,43 +399,33 @@ class WaybackMachineDownloader {
|
||||
}
|
||||
}
|
||||
|
||||
// Orchestrate downloads with concurrency
|
||||
async download_files() {
|
||||
const startTime = Date.now();
|
||||
console.log(
|
||||
`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`
|
||||
);
|
||||
console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
|
||||
const list = await this.get_file_list_by_timestamp();
|
||||
if (list.length === 0) {
|
||||
console.log("No files to download.");
|
||||
return;
|
||||
}
|
||||
|
||||
const concurrency =
|
||||
this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
|
||||
const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
|
||||
const limit = pLimit(concurrency);
|
||||
this._processed = 0;
|
||||
await Promise.all(
|
||||
list.map((info) => limit(() => this._download_single(info, list.length)))
|
||||
);
|
||||
await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
|
||||
const endTime = Date.now();
|
||||
console.log(
|
||||
`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(
|
||||
2
|
||||
)}s, saved in ${this.backup_path()} (${list.length} files)`
|
||||
);
|
||||
console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ============================= INTERACTIVE RUN =============================
|
||||
function ask(rl, question) {
|
||||
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
|
||||
}
|
||||
|
||||
async function interactiveMain() {
|
||||
const rl = readline.createInterface({
|
||||
input: process.stdin,
|
||||
output: process.stdout,
|
||||
});
|
||||
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
||||
|
||||
let base_url;
|
||||
while (true) {
|
||||
@@ -463,20 +449,15 @@ async function interactiveMain() {
|
||||
let canonical_action = "keep";
|
||||
if (rewrite_mode === "relative") {
|
||||
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
|
||||
if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
|
||||
if ((c || '').toLowerCase() === "remove") canonical_action = "remove";
|
||||
}
|
||||
|
||||
let threads_count = await ask(rl, "How many download threads? (default 3): ");
|
||||
threads_count = parseInt(threads_count || "3", 10);
|
||||
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
|
||||
|
||||
const exact_url = /^y(es)?$/i.test(
|
||||
await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")
|
||||
);
|
||||
const directory = await ask(
|
||||
rl,
|
||||
"Target directory (leave blank for default websites/<host>/): "
|
||||
);
|
||||
const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
|
||||
const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
|
||||
|
||||
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
|
||||
const download_external_assets = /^y(es)?$/i.test(ext);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "wayback-downloader",
|
||||
"version": "0.2.0",
|
||||
"version": "0.2.1",
|
||||
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
|
||||
"type": "module",
|
||||
"main": "downloader.js",
|
||||
|
||||
Reference in New Issue
Block a user