mirror of
https://github.com/birbwatcher/wayback-machine-downloader.git
synced 2026-01-29 01:40:41 +00:00
Merge pull request #8 from birbwatcher/work
feat: splitted into files and made some fixes
This commit is contained in:
@@ -84,7 +84,7 @@ Got ideas or suggestions? Feel free to open an issue!
|
|||||||
## Run
|
## Run
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
node downloader.js
|
node index.js
|
||||||
```
|
```
|
||||||
|
|
||||||
After launching, an interactive menu will appear with the following questions:
|
After launching, an interactive menu will appear with the following questions:
|
||||||
|
|||||||
88
wayback-machine-downloader/cli.js
Normal file
88
wayback-machine-downloader/cli.js
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
|
||||||
|
import path from "path";
|
||||||
|
import readline from "readline";
|
||||||
|
|
||||||
|
import { WaybackMachineDownloader } from "./lib/downloader.js";
|
||||||
|
import { normalizeBaseUrlInput } from "./lib/utils.js";
|
||||||
|
|
||||||
|
function ask(rl, question) {
|
||||||
|
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
|
||||||
|
}
|
||||||
|
|
||||||
|
async function interactiveMain() {
|
||||||
|
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
||||||
|
|
||||||
|
let normalizedBase;
|
||||||
|
while (true) {
|
||||||
|
const baseInput = await ask(rl, "Enter domain or URL to archive (e.g., example.com): ");
|
||||||
|
if (!baseInput) continue;
|
||||||
|
try {
|
||||||
|
normalizedBase = normalizeBaseUrlInput(baseInput);
|
||||||
|
break;
|
||||||
|
} catch {
|
||||||
|
console.log("Please enter a valid domain or URL.\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const base_url = normalizedBase.canonicalUrl;
|
||||||
|
|
||||||
|
const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
|
||||||
|
const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");
|
||||||
|
|
||||||
|
let rewrite_mode = "as-is";
|
||||||
|
const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
|
||||||
|
if (/^y(es)?$/i.test(m)) rewrite_mode = "relative";
|
||||||
|
|
||||||
|
let canonical_action = "keep";
|
||||||
|
if (rewrite_mode === "relative") {
|
||||||
|
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
|
||||||
|
if ((c || "").toLowerCase() === "remove") canonical_action = "remove";
|
||||||
|
}
|
||||||
|
|
||||||
|
let threads_count = await ask(rl, "How many download threads? (default 3): ");
|
||||||
|
threads_count = parseInt(threads_count || "3", 10);
|
||||||
|
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
|
||||||
|
|
||||||
|
const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
|
||||||
|
const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
|
||||||
|
|
||||||
|
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
|
||||||
|
const download_external_assets = /^y(es)?$/i.test(ext);
|
||||||
|
|
||||||
|
rl.close();
|
||||||
|
|
||||||
|
const dl = new WaybackMachineDownloader({
|
||||||
|
base_url,
|
||||||
|
normalized_base: normalizedBase,
|
||||||
|
exact_url,
|
||||||
|
directory: directory || null,
|
||||||
|
from_timestamp: from_timestamp || 0,
|
||||||
|
to_timestamp: to_timestamp || 0,
|
||||||
|
threads_count,
|
||||||
|
rewrite_mode,
|
||||||
|
canonical_action,
|
||||||
|
download_external_assets,
|
||||||
|
});
|
||||||
|
|
||||||
|
await dl.download_files();
|
||||||
|
}
|
||||||
|
|
||||||
|
const isDirectCliRun = (() => {
|
||||||
|
const entryArg = process.argv && process.argv.length > 1 ? process.argv[1] : null;
|
||||||
|
if (!entryArg) return false;
|
||||||
|
try {
|
||||||
|
return import.meta.url === `file://${path.resolve(entryArg)}`;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
|
||||||
|
if (isDirectCliRun) {
|
||||||
|
interactiveMain().catch((err) => {
|
||||||
|
console.error(`FATAL: ${err?.stack || err}`);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export { interactiveMain };
|
||||||
@@ -7,7 +7,4 @@ RUN npm install --production
|
|||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
CMD ["node", "downloader.js"]
|
ENTRYPOINT ["node", "index.js"]
|
||||||
|
|
||||||
ENTRYPOINT ["node", "downloader.js"]
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,493 +0,0 @@
|
|||||||
/*
|
|
||||||
* Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM)
|
|
||||||
* Run: node downloader.js
|
|
||||||
*/
|
|
||||||
|
|
||||||
import fs from "fs";
|
|
||||||
import path from "path";
|
|
||||||
import { fileURLToPath, pathToFileURL, domainToUnicode } from "url";
|
|
||||||
import { mkdir } from "fs/promises";
|
|
||||||
import pLimit from "p-limit";
|
|
||||||
import { load } from "cheerio";
|
|
||||||
import { Readable } from "stream";
|
|
||||||
import readline from "readline";
|
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
|
||||||
const __dirname = path.dirname(__filename);
|
|
||||||
|
|
||||||
// ----------------------------- PROGRESS BAR -----------------------------
|
|
||||||
function renderProgress(current, total) {
|
|
||||||
const width = 40;
|
|
||||||
const ratio = total > 0 ? current / total : 0;
|
|
||||||
const filled = Math.round(ratio * width);
|
|
||||||
const bar = "█".repeat(filled) + "-".repeat(width - filled);
|
|
||||||
process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`);
|
|
||||||
if (current === total) process.stdout.write("\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------- HELPERS -----------------------------
|
|
||||||
function toPosix(p) {
|
|
||||||
return p.split(path.sep).join("/");
|
|
||||||
}
|
|
||||||
function relativeLink(fromDir, toFile) {
|
|
||||||
const rel = path.relative(fromDir, toFile);
|
|
||||||
return toPosix(rel || path.basename(toFile));
|
|
||||||
}
|
|
||||||
function ensureLocalTargetForPath(pathname) {
|
|
||||||
return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".")
|
|
||||||
? path.posix.join(pathname, "index.html")
|
|
||||||
: pathname;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ----------------------------- HTML CHECK -----------------------------
|
|
||||||
function isHtmlFile(filePath, contentType, firstBytes) {
|
|
||||||
if (contentType && /text\/html/i.test(String(contentType))) return true;
|
|
||||||
const ext = path.extname(filePath).toLowerCase();
|
|
||||||
if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
|
|
||||||
const head = (firstBytes || "").toString("utf8", 0, 512);
|
|
||||||
return /<!doctype html/i.test(head) || /<html[\\s>]/i.test(head);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ----------------------------- Archive API -----------------------------
|
|
||||||
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
|
|
||||||
const cdx = new URL("https://web.archive.org/cdx/search/xd");
|
|
||||||
const params = new URLSearchParams();
|
|
||||||
params.set("output", "json");
|
|
||||||
params.set("url", baseUrl);
|
|
||||||
params.set("fl", "timestamp,original");
|
|
||||||
params.set("collapse", "digest");
|
|
||||||
params.set("gzip", "false");
|
|
||||||
if (!all) params.append("filter", "statuscode:200");
|
|
||||||
if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
|
|
||||||
if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
|
|
||||||
if (pageIndex != null) params.set("page", String(pageIndex));
|
|
||||||
cdx.search = params.toString();
|
|
||||||
|
|
||||||
try {
|
|
||||||
const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
|
|
||||||
const text = await res.text();
|
|
||||||
let json = [];
|
|
||||||
try {
|
|
||||||
json = JSON.parse(text);
|
|
||||||
} catch {
|
|
||||||
// silent: treat as empty page
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
|
|
||||||
json.shift();
|
|
||||||
}
|
|
||||||
return json || [];
|
|
||||||
} catch {
|
|
||||||
// silent: skip broken page
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ----------------------------- DOWNLOADER CLASS -----------------------------
|
|
||||||
class WaybackMachineDownloader {
|
|
||||||
constructor(params) {
|
|
||||||
this.base_url = params.base_url;
|
|
||||||
this.exact_url = !!params.exact_url;
|
|
||||||
this.directory = params.directory || null;
|
|
||||||
this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
|
|
||||||
this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
|
|
||||||
this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;
|
|
||||||
|
|
||||||
this.download_external_assets = params.download_external_assets || false;
|
|
||||||
|
|
||||||
this.rewrite_mode = params.rewrite_mode || "as-is";
|
|
||||||
this.rewrite_links = this.rewrite_mode === "relative";
|
|
||||||
this.canonical_action = params.canonical_action || "keep";
|
|
||||||
|
|
||||||
this._processed = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create a human-readable backup folder name, preserving IDNs
|
|
||||||
backup_name() {
|
|
||||||
try {
|
|
||||||
if (this.base_url.includes("//")) {
|
|
||||||
const u = new URL(this.base_url);
|
|
||||||
return domainToUnicode(u.host);
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
return this.base_url;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Resolve output directory
|
|
||||||
backup_path() {
|
|
||||||
if (this.directory) {
|
|
||||||
return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
|
|
||||||
}
|
|
||||||
return path.join("websites", this.backup_name(), path.sep);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fetch and merge snapshot lists
|
|
||||||
async get_all_snapshots_to_consider() {
|
|
||||||
console.log("Getting snapshot pages");
|
|
||||||
const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
|
|
||||||
let list = [];
|
|
||||||
|
|
||||||
list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }));
|
|
||||||
process.stdout.write(".");
|
|
||||||
|
|
||||||
if (!this.exact_url) {
|
|
||||||
const wildcard = this.base_url.endsWith("/*") ? this.base_url : this.base_url.replace(/\/*$/, "") + "/*";
|
|
||||||
for (let i = 0; i < 100; i++) {
|
|
||||||
const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
|
|
||||||
if (!batch || batch.length === 0) break;
|
|
||||||
list = list.concat(batch);
|
|
||||||
process.stdout.write(".");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
console.log(` found ${list.length} snapshots to consider.\n`);
|
|
||||||
return list;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Choose the latest timestamp per unique pathname
|
|
||||||
async get_file_list_by_timestamp() {
|
|
||||||
const curated = new Map();
|
|
||||||
const all = await this.get_all_snapshots_to_consider();
|
|
||||||
for (const pair of all) {
|
|
||||||
const ts = pair && pair[0];
|
|
||||||
const url = pair && pair[1];
|
|
||||||
if (!ts || !url) continue;
|
|
||||||
try {
|
|
||||||
const u = new URL(url);
|
|
||||||
const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
|
|
||||||
const prev = curated.get(file_id);
|
|
||||||
if (!prev || prev.timestamp <= ts) {
|
|
||||||
curated.set(file_id, { file_url: url, timestamp: ts, file_id });
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
}
|
|
||||||
const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id }));
|
|
||||||
arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));
|
|
||||||
return arr;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Replace Windows-hostile characters when running on Windows
|
|
||||||
_windowsSanitize(p) {
|
|
||||||
if (process.platform !== "win32") return p;
|
|
||||||
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure directory exists
|
|
||||||
async _structure_dir_path(dir_path) {
|
|
||||||
try {
|
|
||||||
await mkdir(dir_path, { recursive: true });
|
|
||||||
} catch (e) {
|
|
||||||
if (!e || e.code !== "EEXIST") throw e;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute local file paths for a given archived URL
|
|
||||||
_determine_paths(file_url, file_id) {
|
|
||||||
if (!file_url || !file_id) return null;
|
|
||||||
if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null;
|
|
||||||
if (file_id.length > 200) return null;
|
|
||||||
|
|
||||||
const backup = this.backup_path();
|
|
||||||
const parts = file_id.split("/").filter(Boolean);
|
|
||||||
let dir_path, file_path;
|
|
||||||
|
|
||||||
if (file_id === "") {
|
|
||||||
dir_path = backup;
|
|
||||||
file_path = path.join(backup, "index.html");
|
|
||||||
} else {
|
|
||||||
const lastPart = parts[parts.length - 1] || "";
|
|
||||||
if (file_url.endsWith("/") || !lastPart.includes(".")) {
|
|
||||||
dir_path = path.join(backup, ...parts);
|
|
||||||
file_path = path.join(dir_path, "index.html");
|
|
||||||
} else {
|
|
||||||
dir_path = path.join(backup, ...parts.slice(0, -1));
|
|
||||||
file_path = path.join(backup, ...parts);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
dir_path = this._windowsSanitize(dir_path);
|
|
||||||
file_path = this._windowsSanitize(file_path);
|
|
||||||
|
|
||||||
return { dir_path, file_path };
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Download a single asset (img/css/js/etc.) referenced from an HTML page
|
|
||||||
async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) {
|
|
||||||
try {
|
|
||||||
if (fs.existsSync(file_path)) return file_path;
|
|
||||||
|
|
||||||
await this._structure_dir_path(dir_path);
|
|
||||||
const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`;
|
|
||||||
let res;
|
|
||||||
try {
|
|
||||||
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
if (!res.ok || !res.body) {
|
|
||||||
console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
await new Promise((resolve, reject) => {
|
|
||||||
const ws = fs.createWriteStream(file_path);
|
|
||||||
Readable.fromWeb(res.body).pipe(ws);
|
|
||||||
ws.on("finish", resolve);
|
|
||||||
ws.on("error", reject);
|
|
||||||
});
|
|
||||||
|
|
||||||
return file_path;
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`Asset download failed: ${assetUrl} → ${e}`);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parse saved HTML, optionally rewrite internal links to relative and fetch assets
|
|
||||||
async _process_html_assets(htmlPath, pageUrl, pageTimestamp) {
|
|
||||||
try {
|
|
||||||
const backupRoot = this.backup_path();
|
|
||||||
let html = fs.readFileSync(htmlPath, "utf8");
|
|
||||||
const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is
|
|
||||||
const site = new URL(this.base_url);
|
|
||||||
const siteHost = domainToUnicode(site.hostname.replace(/^www\\./, ""));
|
|
||||||
const baseDir = path.dirname(htmlPath);
|
|
||||||
|
|
||||||
const downloadTasks = [];
|
|
||||||
|
|
||||||
// ----------- ASSETS -----------
|
|
||||||
$("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
|
|
||||||
const attr = el.tagName === "link" ? "href" : "src";
|
|
||||||
const val = $(el).attr(attr);
|
|
||||||
if (!val) return;
|
|
||||||
|
|
||||||
try {
|
|
||||||
const abs = new URL(val, pageUrl).toString();
|
|
||||||
const u = new URL(abs);
|
|
||||||
const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost;
|
|
||||||
|
|
||||||
if (isInternal || this.download_external_assets) {
|
|
||||||
const file_id = decodeURIComponent(u.pathname);
|
|
||||||
let paths;
|
|
||||||
try {
|
|
||||||
paths = this._determine_paths(abs, file_id);
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`Invalid path for asset ${abs}: ${e}`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!paths) return;
|
|
||||||
const { dir_path, file_path } = paths;
|
|
||||||
|
|
||||||
if (this.rewrite_links) {
|
|
||||||
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
|
|
||||||
const localTarget = ensureLocalTargetForPath(normPath);
|
|
||||||
const localAbsPath = path.join(backupRoot, localTarget);
|
|
||||||
$(el).attr(attr, relativeLink(baseDir, localAbsPath));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!fs.existsSync(file_path)) {
|
|
||||||
downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
});
|
|
||||||
|
|
||||||
// ----------- INTERNAL LINKS (pages/forms) -----------
|
|
||||||
if (this.rewrite_links) {
|
|
||||||
$("a[href], form[action]").each((_, el) => {
|
|
||||||
const attr = el.tagName === "a" ? "href" : "action";
|
|
||||||
const val = $(el).attr(attr);
|
|
||||||
if (!val) return;
|
|
||||||
|
|
||||||
try {
|
|
||||||
const abs = new URL(val, pageUrl).toString();
|
|
||||||
const u = new URL(abs);
|
|
||||||
const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost;
|
|
||||||
|
|
||||||
if (isInternal) {
|
|
||||||
const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
|
|
||||||
const localTarget = ensureLocalTargetForPath(normPath);
|
|
||||||
const localAbsPath = path.join(backupRoot, localTarget);
|
|
||||||
$(el).attr(attr, relativeLink(baseDir, localAbsPath));
|
|
||||||
}
|
|
||||||
} catch {}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
await Promise.all(downloadTasks);
|
|
||||||
|
|
||||||
if (this.canonical_action === "remove") {
|
|
||||||
$("link[rel=\"canonical\"]").remove();
|
|
||||||
}
|
|
||||||
|
|
||||||
fs.writeFileSync(htmlPath, $.html(), "utf8");
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`HTML processing error: ${e}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Download one file from the snapshot list (page or asset saved by CDX)
|
|
||||||
async _download_single(file_remote_info, total) {
|
|
||||||
const file_url = String(file_remote_info.file_url);
|
|
||||||
const file_id = file_remote_info.file_id;
|
|
||||||
const file_timestamp = file_remote_info.timestamp;
|
|
||||||
|
|
||||||
let paths;
|
|
||||||
try {
|
|
||||||
paths = this._determine_paths(file_url, file_id);
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`Invalid path for ${file_url}: ${e}`);
|
|
||||||
this._processed++;
|
|
||||||
renderProgress(this._processed, total);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!paths) {
|
|
||||||
console.log(`Skipping invalid URL: ${file_url}`);
|
|
||||||
this._processed++;
|
|
||||||
renderProgress(this._processed, total);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const { dir_path, file_path } = paths;
|
|
||||||
|
|
||||||
if (fs.existsSync(file_path)) {
|
|
||||||
this._processed++;
|
|
||||||
renderProgress(this._processed, total);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
await this._structure_dir_path(dir_path);
|
|
||||||
const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
|
|
||||||
let res;
|
|
||||||
try {
|
|
||||||
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`Skipping ${file_url}, fetch failed: ${e}`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!res.ok || !res.body) {
|
|
||||||
console.log(`Skipping ${file_url}, bad response ${res.status}`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
await new Promise((resolve, reject) => {
|
|
||||||
const ws = fs.createWriteStream(file_path);
|
|
||||||
Readable.fromWeb(res.body).pipe(ws);
|
|
||||||
ws.on("finish", resolve);
|
|
||||||
ws.on("error", reject);
|
|
||||||
});
|
|
||||||
|
|
||||||
const contentType = res.headers.get("content-type");
|
|
||||||
const ext = path.extname(file_path).toLowerCase();
|
|
||||||
const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
|
|
||||||
if (looksHtml) {
|
|
||||||
await this._process_html_assets(file_path, file_url, file_timestamp);
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
console.log(`Download failed for ${file_url}: ${e}`);
|
|
||||||
} finally {
|
|
||||||
this._processed++;
|
|
||||||
renderProgress(this._processed, total);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Orchestrate downloads with concurrency
|
|
||||||
async download_files() {
|
|
||||||
const startTime = Date.now();
|
|
||||||
console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
|
|
||||||
const list = await this.get_file_list_by_timestamp();
|
|
||||||
if (list.length === 0) {
|
|
||||||
console.log("No files to download.");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
|
|
||||||
const limit = pLimit(concurrency);
|
|
||||||
this._processed = 0;
|
|
||||||
await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
|
|
||||||
const endTime = Date.now();
|
|
||||||
console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ============================= INTERACTIVE RUN =============================
|
|
||||||
function ask(rl, question) {
|
|
||||||
return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim())));
|
|
||||||
}
|
|
||||||
|
|
||||||
async function interactiveMain() {
|
|
||||||
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
||||||
|
|
||||||
let base_url;
|
|
||||||
while (true) {
|
|
||||||
base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): ");
|
|
||||||
if (!base_url) continue;
|
|
||||||
try {
|
|
||||||
new URL(base_url);
|
|
||||||
break;
|
|
||||||
} catch {
|
|
||||||
console.log("Please enter a valid URL.\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: ");
|
|
||||||
const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: ");
|
|
||||||
|
|
||||||
let rewrite_mode = "as-is";
|
|
||||||
const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): ");
|
|
||||||
if (/^y(es)?$/i.test(m)) rewrite_mode = "relative";
|
|
||||||
|
|
||||||
let canonical_action = "keep";
|
|
||||||
if (rewrite_mode === "relative") {
|
|
||||||
const c = await ask(rl, 'Canonical: "keep" (default) or "remove": ');
|
|
||||||
if ((c || '').toLowerCase() === "remove") canonical_action = "remove";
|
|
||||||
}
|
|
||||||
|
|
||||||
let threads_count = await ask(rl, "How many download threads? (default 3): ");
|
|
||||||
threads_count = parseInt(threads_count || "3", 10);
|
|
||||||
if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3;
|
|
||||||
|
|
||||||
const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): "));
|
|
||||||
const directory = await ask(rl, "Target directory (leave blank for default websites/<host>/): ");
|
|
||||||
|
|
||||||
const ext = await ask(rl, "Download external assets? (yes/no, default no): ");
|
|
||||||
const download_external_assets = /^y(es)?$/i.test(ext);
|
|
||||||
|
|
||||||
rl.close();
|
|
||||||
|
|
||||||
const dl = new WaybackMachineDownloader({
|
|
||||||
base_url,
|
|
||||||
exact_url,
|
|
||||||
directory: directory || null,
|
|
||||||
from_timestamp: from_timestamp || 0,
|
|
||||||
to_timestamp: to_timestamp || 0,
|
|
||||||
threads_count,
|
|
||||||
rewrite_mode,
|
|
||||||
canonical_action,
|
|
||||||
download_external_assets,
|
|
||||||
});
|
|
||||||
|
|
||||||
await dl.download_files();
|
|
||||||
}
|
|
||||||
|
|
||||||
const isDirectRun =
|
|
||||||
import.meta.url === `file://${process.argv[1]}` ||
|
|
||||||
import.meta.url === pathToFileURL(process.argv[1]).href;
|
|
||||||
|
|
||||||
if (isDirectRun) {
|
|
||||||
interactiveMain().catch((err) => {
|
|
||||||
console.error(`FATAL: ${err?.stack || err}`);
|
|
||||||
process.exit(1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
export { WaybackMachineDownloader };
|
|
||||||
39
wayback-machine-downloader/index.js
Normal file
39
wayback-machine-downloader/index.js
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
/*
|
||||||
|
* Wayback Machine Downloader 0.3.0 by WhitelightSEO
|
||||||
|
* Run: node index.js
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { pathToFileURL } from "url";
|
||||||
|
|
||||||
|
import { setDebugMode, getDebugMode, debugLog } from "./lib/logger.js";
|
||||||
|
import { WaybackMachineDownloader } from "./lib/downloader.js";
|
||||||
|
|
||||||
|
const DEBUG_MODE = false;
|
||||||
|
setDebugMode(DEBUG_MODE);
|
||||||
|
|
||||||
|
const isDirectRun = (() => {
|
||||||
|
const entryArg = process.argv && process.argv.length > 1 ? process.argv[1] : null;
|
||||||
|
if (!entryArg) return false;
|
||||||
|
|
||||||
|
if (import.meta.url === `file://${entryArg}`) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return import.meta.url === pathToFileURL(entryArg).href;
|
||||||
|
} catch (e) {
|
||||||
|
debugLog(`Failed to resolve entry script URL: ${e}`);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
|
||||||
|
if (isDirectRun) {
|
||||||
|
import("./cli.js")
|
||||||
|
.then(({ interactiveMain }) => interactiveMain())
|
||||||
|
.catch((err) => {
|
||||||
|
console.error(`FATAL: ${err?.stack || err}`);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
export { WaybackMachineDownloader, DEBUG_MODE, setDebugMode, getDebugMode };
|
||||||
392
wayback-machine-downloader/lib/asset-manager.js
Normal file
392
wayback-machine-downloader/lib/asset-manager.js
Normal file
@@ -0,0 +1,392 @@
|
|||||||
|
import fs from "fs";
|
||||||
|
import path from "path";
|
||||||
|
import { mkdir } from "fs/promises";
|
||||||
|
import { load } from "cheerio";
|
||||||
|
import { Readable } from "stream";
|
||||||
|
import { domainToUnicode } from "url";
|
||||||
|
|
||||||
|
import { debugLog } from "./logger.js";
|
||||||
|
import {
|
||||||
|
relativeLink,
|
||||||
|
ensureLocalTargetForPath,
|
||||||
|
isCssResource,
|
||||||
|
} from "./utils.js";
|
||||||
|
|
||||||
|
class AssetManager {
|
||||||
|
constructor({
|
||||||
|
backupPathResolver,
|
||||||
|
rewriteLinks,
|
||||||
|
canonicalAction,
|
||||||
|
downloadExternalAssets,
|
||||||
|
baseHostUnicode,
|
||||||
|
snapshotIndex,
|
||||||
|
}) {
|
||||||
|
this.backupPathResolver = backupPathResolver;
|
||||||
|
this.rewriteLinks = !!rewriteLinks;
|
||||||
|
this.canonicalAction = canonicalAction || "keep";
|
||||||
|
this.downloadExternalAssets = !!downloadExternalAssets;
|
||||||
|
this.baseHostUnicode = (baseHostUnicode || "").toLowerCase();
|
||||||
|
this.snapshotIndex = snapshotIndex || null;
|
||||||
|
}
|
||||||
|
|
||||||
|
setSnapshotIndex(index) {
|
||||||
|
this.snapshotIndex = index;
|
||||||
|
}
|
||||||
|
|
||||||
|
get backupPath() {
|
||||||
|
const resolver = this.backupPathResolver;
|
||||||
|
return typeof resolver === "function" ? resolver() : resolver;
|
||||||
|
}
|
||||||
|
|
||||||
|
windowsSanitize(p) {
|
||||||
|
if (process.platform !== "win32") return p;
|
||||||
|
return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16));
|
||||||
|
}
|
||||||
|
|
||||||
|
async ensureDir(dirPath) {
|
||||||
|
try {
|
||||||
|
await mkdir(dirPath, { recursive: true });
|
||||||
|
} catch (e) {
|
||||||
|
if (!e || e.code !== "EEXIST") throw e;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
determinePaths(fileUrl, fileId) {
|
||||||
|
if (!fileUrl || !fileId) return null;
|
||||||
|
if (fileUrl.startsWith("data:") || fileUrl.startsWith("javascript:")) return null;
|
||||||
|
if (fileId.length > 200) return null;
|
||||||
|
|
||||||
|
const backup = this.backupPath;
|
||||||
|
const parts = fileId.split("/").filter(Boolean);
|
||||||
|
let dirPath;
|
||||||
|
let filePath;
|
||||||
|
|
||||||
|
if (fileId === "") {
|
||||||
|
dirPath = backup;
|
||||||
|
filePath = path.join(backup, "index.html");
|
||||||
|
} else {
|
||||||
|
const lastPart = parts[parts.length - 1] || "";
|
||||||
|
if (fileUrl.endsWith("/") || !lastPart.includes(".")) {
|
||||||
|
dirPath = path.join(backup, ...parts);
|
||||||
|
filePath = path.join(dirPath, "index.html");
|
||||||
|
} else {
|
||||||
|
dirPath = path.join(backup, ...parts.slice(0, -1));
|
||||||
|
filePath = path.join(backup, ...parts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dirPath = this.windowsSanitize(dirPath);
|
||||||
|
filePath = this.windowsSanitize(filePath);
|
||||||
|
|
||||||
|
return { dirPath, filePath };
|
||||||
|
}
|
||||||
|
|
||||||
|
resolveAssetTimestamp(assetUrl, fallbackTimestamp) {
|
||||||
|
if (!this.snapshotIndex) return fallbackTimestamp || 0;
|
||||||
|
return this.snapshotIndex.resolve(assetUrl, fallbackTimestamp);
|
||||||
|
}
|
||||||
|
|
||||||
|
async downloadAsset(assetUrl, pageTimestamp, filePath, dirPath) {
|
||||||
|
try {
|
||||||
|
if (fs.existsSync(filePath)) return filePath;
|
||||||
|
|
||||||
|
await this.ensureDir(dirPath);
|
||||||
|
const assetTimestamp = this.resolveAssetTimestamp(assetUrl, pageTimestamp);
|
||||||
|
if (!assetTimestamp) {
|
||||||
|
debugLog(`Skipping asset ${assetUrl}, no timestamp available in range.`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
const snapshotUrl = `https://web.archive.org/web/${assetTimestamp}id_/${assetUrl}`;
|
||||||
|
let res;
|
||||||
|
try {
|
||||||
|
res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
|
||||||
|
} catch (e) {
|
||||||
|
debugLog(`Skipping asset ${assetUrl}, fetch failed: ${e}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
if (!res.ok || !res.body) {
|
||||||
|
debugLog(`Skipping asset ${assetUrl}, bad response ${res.status}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const contentType = res.headers.get("content-type") || "";
|
||||||
|
|
||||||
|
await new Promise((resolve, reject) => {
|
||||||
|
const ws = fs.createWriteStream(filePath);
|
||||||
|
Readable.fromWeb(res.body).pipe(ws);
|
||||||
|
ws.on("finish", resolve);
|
||||||
|
ws.on("error", reject);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (this.rewriteLinks && isCssResource(filePath, assetUrl, contentType)) {
|
||||||
|
await this.rewriteCssFile(filePath, assetUrl, assetTimestamp);
|
||||||
|
}
|
||||||
|
|
||||||
|
return filePath;
|
||||||
|
} catch (e) {
|
||||||
|
debugLog(`Asset download failed: ${assetUrl} → ${e}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
  /**
   * Rewrite url(...) and @import references inside a CSS string so they point
   * at local mirrored files, queueing downloads for assets not yet on disk.
   *
   * @param {string} cssContent - Raw CSS text to rewrite.
   * @param {string} cssSourceUrl - Absolute URL the CSS came from (base for relative refs).
   * @param {string|number} pageTimestamp - Fallback Wayback timestamp for asset snapshots.
   * @param {object} [opts]
   * @param {string} [opts.baseDir] - Directory relative links are computed from
   *   (defaults to each asset's own target directory).
   * @param {string} [opts.excludePath] - Local file to never schedule for download
   *   (used to avoid a CSS file re-downloading itself).
   * @returns {Promise<{css: string, downloads: Promise<any>[]}>} The (possibly
   *   unchanged) CSS plus pending download promises the caller must await.
   */
  async rewriteCssContent(cssContent, cssSourceUrl, pageTimestamp, { baseDir, excludePath } = {}) {
    // Rewriting is opt-in; pass content through untouched otherwise.
    if (!this.rewriteLinks) {
      return { css: cssContent, downloads: [] };
    }

    // Nothing to do for empty / whitespace-only CSS.
    if (!cssContent || !cssContent.trim()) {
      return { css: cssContent, downloads: [] };
    }

    const siteHost = this.baseHostUnicode;
    const downloads = [];
    // Guards against scheduling the same target file twice within this call.
    const seenPaths = new Set();
    let updatedContent = cssContent;
    let cssChanged = false;

    // Resolve one raw CSS reference to {original, replacement}, or null when it
    // must be left alone (data:/fragment URIs, external hosts, bad URLs, ...).
    const processReference = (rawValue) => {
      if (!rawValue) return null;
      const trimmed = rawValue.trim();
      if (!trimmed) return null;
      // Inline data, script URIs and pure fragments are never rewritten.
      if (/^(data:|javascript:|#)/i.test(trimmed)) return null;

      let absoluteUrl;
      try {
        absoluteUrl = new URL(trimmed, cssSourceUrl).toString();
      } catch {
        return null;
      }

      let parsed;
      try {
        parsed = new URL(absoluteUrl);
      } catch {
        return null;
      }
      if (!/^https?:$/i.test(parsed.protocol)) return null;

      // Compare hosts with "www." stripped and punycode decoded, same as the
      // site host stored on this manager.
      const normalizedHost = domainToUnicode(parsed.hostname.replace(/^www\./, "")).toLowerCase();
      const isInternal = normalizedHost === siteHost;
      if (!isInternal && !this.downloadExternalAssets) return null;

      // Prefer the decoded pathname as the file id; fall back to the raw one
      // when it contains invalid percent-escapes.
      let fileId;
      try {
        fileId = decodeURIComponent(parsed.pathname);
      } catch {
        fileId = parsed.pathname;
      }
      let paths;
      try {
        paths = this.determinePaths(absoluteUrl, fileId);
      } catch {
        return null;
      }
      if (!paths) return null;

      const { dirPath, filePath } = paths;
      const assetTimestamp = this.resolveAssetTimestamp(absoluteUrl, pageTimestamp);

      // Schedule a download unless the file already exists, was already queued
      // in this pass, or is the excluded file (e.g. the CSS file itself).
      if (
        filePath &&
        (!excludePath || path.resolve(filePath) !== path.resolve(excludePath))
      ) {
        const key = path.resolve(filePath);
        if (!fs.existsSync(filePath) && !seenPaths.has(key)) {
          seenPaths.add(key);
          downloads.push(this.downloadAsset(absoluteUrl, assetTimestamp, filePath, dirPath));
        }
      }

      // Relative link is computed from baseDir (the rewritten file's dir) when
      // given; the URL fragment, if any, is preserved.
      const relativeBase = baseDir || path.dirname(filePath);
      const relativePath = relativeLink(relativeBase, filePath) + (parsed.hash || "");

      return {
        original: trimmed,
        replacement: relativePath,
      };
    };

    // url(...) with optional single/double quotes; backreference \1 keeps the
    // closing quote consistent with the opening one.
    const urlPattern = /url\(\s*(['"]?)([^'")]+)\1\s*\)/gi;
    updatedContent = updatedContent.replace(urlPattern, (match, quote, value) => {
      const info = processReference(value);
      if (!info) return match;
      if (info.replacement === info.original) return match;
      cssChanged = true;
      const q = quote || "";
      return `url(${q}${info.replacement}${q})`;
    });

    // @import "..." form only; @import url(...) is already handled above
    // (the lookahead skips the url( variant).
    const importPattern = /@import\s+(?!url\()\s*(['"])([^'"]+)\1/gi;
    updatedContent = updatedContent.replace(importPattern, (match, quote, value) => {
      const info = processReference(value);
      if (!info) return match;
      if (info.replacement === info.original) return match;
      cssChanged = true;
      return match.replace(value, info.replacement);
    });

    return {
      // Return the original string object when nothing changed so callers can
      // use identity/equality checks to skip rewriting files on disk.
      css: cssChanged && updatedContent !== cssContent ? updatedContent : cssContent,
      downloads,
    };
  }
|
||||||
|
|
||||||
|
async rewriteCssFile(cssPath, cssSourceUrl, pageTimestamp) {
|
||||||
|
if (!this.rewriteLinks) return;
|
||||||
|
|
||||||
|
let cssContent;
|
||||||
|
try {
|
||||||
|
cssContent = fs.readFileSync(cssPath, "utf8");
|
||||||
|
} catch {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const cssDir = path.dirname(cssPath);
|
||||||
|
const { css: updatedContent, downloads } = await this.rewriteCssContent(
|
||||||
|
cssContent,
|
||||||
|
cssSourceUrl,
|
||||||
|
pageTimestamp,
|
||||||
|
{
|
||||||
|
baseDir: cssDir,
|
||||||
|
excludePath: cssPath,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (downloads.length > 0) {
|
||||||
|
await Promise.all(downloads);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (updatedContent !== cssContent) {
|
||||||
|
fs.writeFileSync(cssPath, updatedContent, "utf8");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
  /**
   * Post-process a downloaded HTML file: download referenced assets, rewrite
   * asset/anchor/form links to local relative paths (when enabled), localize
   * CSS in <style> blocks and style="" attributes, and optionally strip
   * canonical link tags. The file is rewritten in place.
   *
   * @param {string} htmlPath - Local path of the HTML file.
   * @param {string} pageUrl - Original page URL (base for relative references).
   * @param {string|number} pageTimestamp - Wayback timestamp of the page snapshot.
   */
  async processHtml(htmlPath, pageUrl, pageTimestamp) {
    try {
      let html = fs.readFileSync(htmlPath, "utf8");
      // decodeEntities: false keeps the original entity encoding on re-serialize.
      const $ = load(html, { decodeEntities: false });
      const siteHost = this.baseHostUnicode;
      const baseDir = path.dirname(htmlPath);
      const backupRoot = this.backupPath;

      // All asset downloads are collected and awaited together at the end.
      const downloadTasks = [];

      // Shared helper for <style> bodies and style="" attributes: rewrite the
      // CSS fragment relative to this page's directory and queue its assets.
      const handleCssFragment = async (cssText) => {
        const { css: updatedCss, downloads } = await this.rewriteCssContent(
          cssText,
          pageUrl,
          pageTimestamp,
          { baseDir }
        );
        if (downloads.length > 0) {
          downloadTasks.push(...downloads);
        }
        return updatedCss;
      };

      // Pass 1: asset-bearing elements. <link> uses href; everything else src.
      $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => {
        const attr = el.tagName === "link" ? "href" : "src";
        const val = $(el).attr(attr);
        if (!val) return;

        try {
          const abs = new URL(val, pageUrl).toString();
          const u = new URL(abs);
          // Host comparison mirrors rewriteCssContent: strip "www.", punycode-decode.
          const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase();
          const isInternal = normalizedHost === siteHost;

          if (isInternal || this.downloadExternalAssets) {
            let fileId;
            try {
              fileId = decodeURIComponent(u.pathname);
            } catch {
              fileId = u.pathname;
            }
            let paths;
            try {
              paths = this.determinePaths(abs, fileId);
            } catch (e) {
              console.log(`Invalid path for asset ${abs}: ${e}`);
              return;
            }
            if (!paths) return;
            const { dirPath, filePath } = paths;

            // Rewrite the attribute to a relative link into the local mirror,
            // preserving any URL fragment.
            if (this.rewriteLinks) {
              const normPath = fileId + (u.hash || "");
              const localTarget = ensureLocalTargetForPath(normPath);
              const localAbsPath = path.join(backupRoot, localTarget);
              $(el).attr(attr, relativeLink(baseDir, localAbsPath));
            }

            // Queue the download only when the file is not already mirrored.
            if (!fs.existsSync(filePath)) {
              downloadTasks.push(
                this.downloadAsset(abs, pageTimestamp, filePath, dirPath)
              );
            }
          }
        } catch {}
        // Malformed URLs are intentionally ignored; the attribute stays as-is.
      });

      // Pass 2: embedded <style> blocks.
      const styleNodes = $("style").toArray();
      for (const node of styleNodes) {
        const cssText = $(node).html();
        if (!cssText) continue;
        const updated = await handleCssFragment(cssText);
        if (updated !== cssText) {
          $(node).text(updated);
        }
      }

      // Pass 3: inline style="" attributes (may contain url(...) references).
      const inlineStyled = $("[style]").toArray();
      for (const node of inlineStyled) {
        const styleAttr = $(node).attr("style");
        if (!styleAttr) continue;
        const updated = await handleCssFragment(styleAttr);
        if (updated !== styleAttr) {
          $(node).attr("style", updated);
        }
      }

      // Pass 4: navigation links. Only internal anchors/forms are rewritten;
      // external ones keep their absolute URLs. Nothing here is downloaded.
      if (this.rewriteLinks) {
        $("a[href], form[action]").each((_, el) => {
          const attr = el.tagName === "a" ? "href" : "action";
          const val = $(el).attr(attr);
          if (!val) return;

          try {
            const abs = new URL(val, pageUrl).toString();
            const u = new URL(abs);
            const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase();
            const isInternal = normalizedHost === siteHost;

            if (isInternal) {
              let normPath;
              try {
                normPath = decodeURIComponent(u.pathname);
              } catch {
                normPath = u.pathname;
              }
              normPath += u.hash || "";
              // Directory-style paths map to their index.html inside the mirror.
              const localTarget = ensureLocalTargetForPath(normPath);
              const localAbsPath = path.join(backupRoot, localTarget);
              $(el).attr(attr, relativeLink(baseDir, localAbsPath));
            }
          } catch {}
        });
      }

      await Promise.all(downloadTasks);

      // Optionally drop <link rel="canonical"> so the mirror does not point
      // search engines back at the original site.
      if (this.canonicalAction === "remove") {
        $("link[rel=\"canonical\"]").remove();
      }

      fs.writeFileSync(htmlPath, $.html(), "utf8");
    } catch (e) {
      // Best-effort: a broken page must not abort the whole crawl.
      console.log(`HTML processing error: ${e}`);
    }
  }
|
||||||
|
}
|
||||||
|
|
||||||
|
export { AssetManager };
|
||||||
222
wayback-machine-downloader/lib/downloader.js
Normal file
222
wayback-machine-downloader/lib/downloader.js
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
import fs from "fs";
|
||||||
|
import path from "path";
|
||||||
|
import { domainToUnicode } from "url";
|
||||||
|
import pLimit from "p-limit";
|
||||||
|
import { Readable } from "stream";
|
||||||
|
|
||||||
|
import { debugLog } from "./logger.js";
|
||||||
|
import { renderProgress, normalizeBaseUrlInput, isHtmlFile, isCssResource } from "./utils.js";
|
||||||
|
import { SnapshotIndex } from "./snapshot-index.js";
|
||||||
|
import { AssetManager } from "./asset-manager.js";
|
||||||
|
|
||||||
|
/**
 * Fetch one page of snapshot pairs from the Wayback Machine CDX server.
 *
 * @param {object} opts
 * @param {string} opts.baseUrl - URL (optionally ending in "/*") to query.
 * @param {number|null} opts.pageIndex - CDX result page, or null for the unpaged query.
 * @param {boolean} opts.all - When false, only HTTP-200 captures are requested.
 * @param {number|string} [opts.fromTimestamp] - Lower bound (YYYYMMDDhhmmss); 0/empty disables.
 * @param {number|string} [opts.toTimestamp] - Upper bound (YYYYMMDDhhmmss); 0/empty disables.
 * @returns {Promise<Array<[string, string]>>} [timestamp, originalUrl] pairs; [] on any failure.
 */
async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) {
  // Bug fix: the CDX search endpoint is /cdx/search/cdx. The previous
  // "/cdx/search/xd" is not a valid endpoint, so every query failed and the
  // error-swallowing catch below silently turned that into an empty list.
  const cdx = new URL("https://web.archive.org/cdx/search/cdx");
  const params = new URLSearchParams();
  params.set("output", "json");
  params.set("url", baseUrl);
  params.set("fl", "timestamp,original");
  params.set("collapse", "digest"); // collapse consecutive identical captures
  params.set("gzip", "false");
  if (!all) params.append("filter", "statuscode:200");
  if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp));
  if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp));
  if (pageIndex != null) params.set("page", String(pageIndex));
  cdx.search = params.toString();

  try {
    const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" });
    const text = await res.text();
    let json = [];
    try {
      json = JSON.parse(text);
    } catch {
      // Non-JSON body (error page, rate limiting, ...): treat as no results.
      return [];
    }
    // The first row of CDX JSON output is the field-name header; strip it.
    if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") {
      json.shift();
    }
    return json || [];
  } catch {
    // Network failure is treated as "no snapshots" by design.
    return [];
  }
}
|
||||||
|
|
||||||
|
/**
 * Orchestrates a full site download from the Wayback Machine: enumerates
 * snapshots via the CDX API, picks the newest capture per path, and downloads
 * files concurrently, optionally rewriting links for offline browsing.
 */
class WaybackMachineDownloader {
  /**
   * @param {object} params
   * @param {string} [params.base_url] - Domain or URL to mirror (required unless
   *   params.normalized_base is supplied).
   * @param {object} [params.normalized_base] - Pre-normalized result of
   *   normalizeBaseUrlInput(), used as-is when present.
   * @param {boolean} [params.exact_url] - Skip the wildcard (whole-site) crawl.
   * @param {string} [params.directory] - Output directory; defaults to websites/<host>/.
   * @param {number|string} [params.from_timestamp] - Lower snapshot bound; 0 = none.
   * @param {number|string} [params.to_timestamp] - Upper snapshot bound; 0 = none.
   * @param {number} [params.threads_count] - Concurrent downloads (default 3).
   * @param {boolean} [params.download_external_assets] - Also mirror off-site assets.
   * @param {string} [params.rewrite_mode] - "relative" enables link rewriting; default "as-is".
   * @param {string} [params.canonical_action] - "remove" strips canonical tags; default "keep".
   */
  constructor(params) {
    const normalized = params.normalized_base || normalizeBaseUrlInput(params.base_url);

    this.base_url = normalized.canonicalUrl;
    this.base_variants = normalized.variants;
    // Unicode (punycode-decoded) host, used for internal/external host checks.
    this.base_host_unicode = (normalized.unicodeHost || normalized.bareHost).toLowerCase();

    this.exact_url = !!params.exact_url;
    this.directory = params.directory || null;
    this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0;
    this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0;
    this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3;

    this.download_external_assets = params.download_external_assets || false;

    this.rewrite_mode = params.rewrite_mode || "as-is";
    this.rewrite_links = this.rewrite_mode === "relative";
    this.canonical_action = params.canonical_action || "keep";

    // Progress counter shared by all concurrent _download_single calls.
    this._processed = 0;
    this.snapshotIndex = null;

    // backupPathResolver is a closure so the AssetManager always sees the
    // current output directory, even if this.directory changes later.
    this.assetManager = new AssetManager({
      backupPathResolver: () => this.backup_path(),
      rewriteLinks: this.rewrite_links,
      canonicalAction: this.canonical_action,
      downloadExternalAssets: this.download_external_assets,
      baseHostUnicode: this.base_host_unicode,
      snapshotIndex: null,
    });
  }

  /**
   * Human-readable name of the mirror, used as the default output folder.
   * For a URL this is its (punycode-decoded) host; otherwise the raw base_url.
   * @returns {string}
   */
  backup_name() {
    try {
      if (this.base_url.includes("//")) {
        const u = new URL(this.base_url);
        return domainToUnicode(u.host);
      }
    } catch {}
    return this.base_url;
  }

  /**
   * Output directory for the mirror, always with a trailing path separator.
   * @returns {string}
   */
  backup_path() {
    if (this.directory) {
      return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep;
    }
    return path.join("websites", this.backup_name(), path.sep);
  }

  /**
   * Enumerate every CDX snapshot row for all base-URL variants (http/https,
   * with/without www), plus — unless exact_url — paginated wildcard queries
   * covering the whole site (capped at 100 pages per variant).
   * @returns {Promise<Array<[string, string]>>} Raw [timestamp, url] rows.
   */
  async get_all_snapshots_to_consider() {
    console.log("Getting snapshot pages");
    const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp };
    let list = [];
    const bases = this.base_variants && this.base_variants.length > 0 ? this.base_variants : [this.base_url];

    for (const base of bases) {
      // The exact-URL query first (pageIndex null = unpaged).
      list = list.concat(await getRawListFromApi({ baseUrl: base, pageIndex: null, ...httpOpts }));
      process.stdout.write(".");

      if (!this.exact_url) {
        // Whole-site crawl: append "/*" and walk CDX pages until one is empty.
        const wildcard = base.endsWith("/*") ? base : base.replace(/\/*$/, "") + "/*";
        for (let i = 0; i < 100; i++) {
          const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts });
          if (!batch || batch.length === 0) break;
          list = list.concat(batch);
          process.stdout.write(".");
        }
      }
    }
    console.log(` found ${list.length} snapshots to consider.\n`);
    return list;
  }

  /**
   * Build the download manifest: register every snapshot in a SnapshotIndex
   * (which keeps the newest capture per path) and hand the index to the
   * AssetManager so asset timestamps can be resolved consistently.
   * @returns {Promise<Array<object>>} Manifest entries ({file_url, file_id, timestamp}).
   */
  async get_file_list_by_timestamp() {
    const index = new SnapshotIndex();
    const all = await this.get_all_snapshots_to_consider();
    for (const pair of all) {
      const ts = pair && pair[0];
      const url = pair && pair[1];
      if (!ts || !url) continue;
      index.register(url, ts);
    }

    const manifest = index.getManifest();
    this.snapshotIndex = index;
    this.assetManager.setSnapshotIndex(index);
    return manifest;
  }

  /**
   * Download one manifest entry, then (when link rewriting is on) post-process
   * it as CSS and/or HTML. Always advances the progress bar exactly once.
   *
   * @param {object} file_remote_info - Manifest entry ({file_url, file_id, timestamp}).
   * @param {number} total - Total manifest size, for the progress display.
   */
  async _download_single(file_remote_info, total) {
    const file_url = String(file_remote_info.file_url);
    const file_id = file_remote_info.file_id;
    const file_timestamp = file_remote_info.timestamp;

    let paths;
    try {
      paths = this.assetManager.determinePaths(file_url, file_id);
    } catch (e) {
      console.log(`Invalid path for ${file_url}: ${e}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    if (!paths) {
      console.log(`Skipping invalid URL: ${file_url}`);
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    const { dirPath, filePath } = paths;

    // Resume support: never re-download files already on disk.
    if (fs.existsSync(filePath)) {
      this._processed++;
      renderProgress(this._processed, total);
      return;
    }

    try {
      await this.assetManager.ensureDir(dirPath);
      // "id_" requests the raw capture without the Wayback toolbar/rewrites.
      const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`;
      let res;
      try {
        res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" });
      } catch (e) {
        // Early returns here still hit the finally block below for progress.
        debugLog(`Skipping ${file_url}, fetch failed: ${e}`);
        return;
      }

      if (!res.ok || !res.body) {
        debugLog(`Skipping ${file_url}, bad response ${res.status}`);
        return;
      }

      // Stream the body straight to disk.
      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(filePath);
        Readable.fromWeb(res.body).pipe(ws);
        ws.on("finish", resolve);
        ws.on("error", reject);
      });

      const contentType = res.headers.get("content-type") || "";
      const ext = path.extname(filePath).toLowerCase();
      // Extension-less paths are treated as HTML (typical for page URLs).
      const looksHtml = isHtmlFile(filePath, contentType, null) || ext === "" || ext === ".html" || ext === ".htm";
      if (this.rewrite_links && isCssResource(filePath, file_url, contentType)) {
        await this.assetManager.rewriteCssFile(filePath, file_url, file_timestamp);
      }
      if (this.rewrite_links && looksHtml) {
        await this.assetManager.processHtml(filePath, file_url, file_timestamp);
      }
    } catch (e) {
      debugLog(`Download failed for ${file_url}: ${e}`);
    } finally {
      this._processed++;
      renderProgress(this._processed, total);
    }
  }

  /**
   * Entry point: build the manifest and download everything with bounded
   * concurrency (threads_count, min 1), reporting progress and total time.
   */
  async download_files() {
    const startTime = Date.now();
    console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`);
    const list = await this.get_file_list_by_timestamp();
    if (list.length === 0) {
      console.log("No files to download.");
      return;
    }

    const concurrency = this.threads_count && this.threads_count > 0 ? this.threads_count : 1;
    const limit = pLimit(concurrency);
    this._processed = 0;
    await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length))));
    const endTime = Date.now();
    console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`);
  }
}
|
||||||
|
|
||||||
|
export { WaybackMachineDownloader };
|
||||||
21
wayback-machine-downloader/lib/logger.js
Normal file
21
wayback-machine-downloader/lib/logger.js
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
/** Module-level flag: when false, debugLog output is suppressed. */
let debugMode = false;

/**
 * Enable or disable debug logging for the whole process.
 * @param {*} value - Coerced to a boolean.
 */
function setDebugMode(value) {
  debugMode = Boolean(value);
}

/** @returns {boolean} Whether debug logging is currently enabled. */
function getDebugMode() {
  return debugMode;
}

/**
 * Log to the console only while debug mode is on.
 * @param {...*} args - Forwarded to console.log unchanged.
 */
function debugLog(...args) {
  if (!debugMode) {
    return;
  }
  console.log(...args);
}

/**
 * Unconditional informational log.
 * @param {...*} args - Forwarded to console.log unchanged.
 */
function infoLog(...args) {
  console.log(...args);
}

export { setDebugMode, getDebugMode, debugLog, infoLog };
|
||||||
138
wayback-machine-downloader/lib/snapshot-index.js
Normal file
138
wayback-machine-downloader/lib/snapshot-index.js
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
/**
 * In-memory index of Wayback snapshots keyed by decoded URL path (and by
 * path+query), keeping only the newest capture for each key. Lookup caches
 * are built lazily and invalidated on every register() call.
 *
 * NOTE(review): timestamps are compared as strings; this assumes fixed-width
 * 14-digit CDX timestamps (YYYYMMDDhhmmss) — confirm against the CDX source.
 */
class SnapshotIndex {
  constructor() {
    // Newest entry per decoded pathname.
    this.byPath = new Map();
    // Newest entry per decoded pathname + query string.
    this.byPathAndQuery = new Map();
    // Lazily built lookup caches (path -> timestamp); null until buildCaches().
    this.lookupByPath = null;
    this.lookupByPathAndQuery = null;
    // Lazily built manifest (array of {file_url, timestamp, file_id}).
    this.manifestCache = null;
  }

  /**
   * Record one snapshot row. Keeps the entry with the greatest timestamp per
   * path and per path+query; invalid URLs are silently ignored.
   * @param {string} url - Original captured URL.
   * @param {string|number} timestamp - CDX capture timestamp.
   */
  register(url, timestamp) {
    if (!url || !timestamp) return;

    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      return;
    }

    // Prefer the decoded pathname; fall back to the raw one when it contains
    // invalid percent-escapes.
    let filePath;
    try {
      filePath = decodeURIComponent(parsed.pathname);
    } catch {
      filePath = parsed.pathname;
    }
    const search = parsed.search || "";
    const queryKey = `${filePath}${search}`;

    const normalizedTimestamp = String(timestamp);

    // "<=" means ties are resolved in favor of the later registration.
    const currentByPath = this.byPath.get(filePath);
    if (!currentByPath || String(currentByPath.timestamp) <= normalizedTimestamp) {
      this.byPath.set(filePath, {
        file_url: url,
        timestamp: normalizedTimestamp,
        file_id: filePath,
      });
    }

    const currentByQuery = this.byPathAndQuery.get(queryKey);
    if (!currentByQuery || String(currentByQuery.timestamp) <= normalizedTimestamp) {
      this.byPathAndQuery.set(queryKey, {
        file_url: url,
        timestamp: normalizedTimestamp,
        file_id: filePath,
      });
    }

    // Any mutation invalidates the lazily built caches.
    this.lookupByPath = null;
    this.lookupByPathAndQuery = null;
    this.manifestCache = null;
  }

  /**
   * (Re)build the manifest and the path/path+query timestamp lookup tables.
   * No-op when the caches are already valid.
   */
  buildCaches() {
    if (this.manifestCache) {
      return;
    }

    const manifest = Array.from(this.byPath.entries()).map(([file_id, value]) => ({
      ...value,
      file_id,
    }));

    // Newest-first; localeCompare on fixed-width digit strings orders numerically.
    manifest.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp)));

    const byPath = new Map();
    const byQuery = new Map();

    // First-wins insertion over a newest-first list => each key maps to its
    // newest timestamp.
    for (const entry of manifest) {
      const { file_url, file_id, timestamp } = entry;
      if (file_id && timestamp && !byPath.has(file_id)) {
        byPath.set(file_id, timestamp);
      }
      if (file_url) {
        try {
          const u = new URL(file_url);
          let decodedPath;
          try {
            decodedPath = decodeURIComponent(u.pathname);
          } catch {
            decodedPath = u.pathname;
          }
          const pathKey = `${decodedPath}${u.search || ""}`;
          if (pathKey && timestamp && !byQuery.has(pathKey)) {
            byQuery.set(pathKey, timestamp);
          }
        } catch {}
      }
    }

    // Backfill from the path+query index: variants whose query string differs
    // from the manifest entry's still get a timestamp, and their base path is
    // added when the plain-path table lacks it.
    for (const [queryKey, entry] of this.byPathAndQuery.entries()) {
      const ts = entry && entry.timestamp;
      if (!queryKey || !ts) continue;
      if (!byQuery.has(queryKey)) {
        byQuery.set(queryKey, ts);
      }
      const basePath = queryKey.replace(/\?.*$/, "");
      if (basePath && !byPath.has(basePath)) {
        byPath.set(basePath, ts);
      }
    }

    this.manifestCache = manifest;
    this.lookupByPath = byPath;
    this.lookupByPathAndQuery = byQuery;
  }

  /**
   * @returns {Array<object>} Newest-first manifest of unique paths
   *   ({file_url, timestamp, file_id}); [] when nothing was registered.
   */
  getManifest() {
    this.buildCaches();
    return this.manifestCache || [];
  }

  /**
   * Best snapshot timestamp for an asset URL: exact path+query match first,
   * then plain path, then the supplied fallback (or 0).
   * @param {string} assetUrl - Absolute asset URL.
   * @param {string|number} [fallbackTimestamp] - Used when the URL is unknown.
   * @returns {string|number} Resolved timestamp.
   */
  resolve(assetUrl, fallbackTimestamp) {
    this.buildCaches();
    let resolved = fallbackTimestamp || 0;
    if (!assetUrl) return resolved;

    try {
      const u = new URL(assetUrl);
      let decodedPath;
      try {
        decodedPath = decodeURIComponent(u.pathname);
      } catch {
        decodedPath = u.pathname;
      }
      const queryKey = `${decodedPath}${u.search || ""}`;
      if (this.lookupByPathAndQuery && this.lookupByPathAndQuery.has(queryKey)) {
        resolved = this.lookupByPathAndQuery.get(queryKey);
      } else if (this.lookupByPath && this.lookupByPath.has(decodedPath)) {
        resolved = this.lookupByPath.get(decodedPath);
      }
    } catch {}

    return resolved;
  }
}
|
||||||
|
|
||||||
|
export { SnapshotIndex };
|
||||||
117
wayback-machine-downloader/lib/utils.js
Normal file
117
wayback-machine-downloader/lib/utils.js
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
import path from "path";
|
||||||
|
import { domainToUnicode } from "url";
|
||||||
|
|
||||||
|
/**
 * Draw a single-line console progress bar, overwriting the previous one with
 * a carriage return; a newline is emitted when the bar reaches completion.
 * @param {number} current - Items processed so far.
 * @param {number} total - Total item count (0 renders an empty bar).
 */
function renderProgress(current, total) {
  const BAR_WIDTH = 40;
  const fraction = total > 0 ? current / total : 0;
  const filledCells = Math.round(fraction * BAR_WIDTH);
  const bar = "█".repeat(filledCells) + "-".repeat(BAR_WIDTH - filledCells);
  const percent = Math.round(fraction * 100);
  process.stdout.write(`\r[${bar}] ${percent}% (${current}/${total})`);
  if (current === total) {
    process.stdout.write("\n");
  }
}
|
||||||
|
|
||||||
|
/**
 * Convert a platform-specific path to forward-slash (POSIX) form.
 * @param {string} p - Path using the current platform's separator.
 * @returns {string} The same path with every separator replaced by "/".
 */
function toPosix(p) {
  return p.replaceAll(path.sep, "/");
}
|
||||||
|
|
||||||
|
/**
 * Relative link from a directory to a file, normalized to forward slashes.
 * When the two resolve to the same location (empty relative path), the file's
 * basename is returned instead.
 * @param {string} fromDir - Directory the link is written from.
 * @param {string} toFile - Target file path.
 * @returns {string} POSIX-style relative link.
 */
function relativeLink(fromDir, toFile) {
  const rel = path.relative(fromDir, toFile) || path.basename(toFile);
  return rel.split(path.sep).join("/");
}
|
||||||
|
|
||||||
|
/**
 * Map a URL pathname to a concrete local file target: directory-like paths
 * (trailing slash, or an extension-less last segment) point to their
 * index.html; paths that already name a file are returned unchanged.
 * @param {string} pathname - Decoded URL pathname (POSIX-style).
 * @returns {string} Local file path for the mirror.
 */
function ensureLocalTargetForPath(pathname) {
  const lastSegment = path.posix.basename(pathname);
  const looksLikeDirectory = pathname.endsWith("/") || !lastSegment.includes(".");
  if (looksLikeDirectory) {
    return path.posix.join(pathname, "index.html");
  }
  return pathname;
}
|
||||||
|
|
||||||
|
/**
 * Normalize a user-supplied domain or URL into a canonical https URL plus the
 * set of host/protocol variants worth querying on the Wayback Machine.
 *
 * @param {string} input - Domain or URL (scheme optional; https assumed).
 * @returns {{canonicalUrl: string, variants: string[], bareHost: string, unicodeHost: string}}
 * @throws {Error} On empty input, unparseable URLs, non-http(s) schemes, or a
 *   missing hostname.
 */
function normalizeBaseUrlInput(input) {
  if (!input || typeof input !== "string") {
    throw new Error("Base URL must be a non-empty string");
  }

  let raw = input.trim();
  if (!raw) {
    throw new Error("Base URL must not be empty");
  }

  // Bare domains get an https scheme so new URL() can parse them.
  const hasScheme = /^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(raw);
  if (!hasScheme) {
    raw = `https://${raw}`;
  }

  let parsed;
  try {
    parsed = new URL(raw);
  } catch (e) {
    throw new Error(`Invalid URL: ${e.message}`);
  }

  if (!/^https?:$/i.test(parsed.protocol)) {
    throw new Error("Only http and https protocols are supported");
  }

  const asciiHost = parsed.hostname.toLowerCase();
  if (!asciiHost) {
    throw new Error("URL must contain a hostname");
  }

  // Canonical form: https, no leading "www.", no trailing slashes on the path.
  const bareHost = asciiHost.replace(/^www\./, "");
  const unicodeHost = domainToUnicode(bareHost);
  const port = parsed.port ? `:${parsed.port}` : "";
  const basePath =
    parsed.pathname && parsed.pathname !== "/" ? parsed.pathname.replace(/\/+$/, "") : "";

  const canonicalUrl = `https://${bareHost}${port}${basePath}`;

  // Pair the bare host with its "www." counterpart (in whichever direction
  // applies) so both forms are queried.
  const hosts = new Set([`${bareHost}${port}`]);
  if (asciiHost !== bareHost) {
    hosts.add(`${asciiHost}${port}`);
  } else if (bareHost && bareHost.includes(".")) {
    hosts.add(`www.${bareHost}${port}`);
  }

  // Cross every host form with both protocols, https first.
  const variants = new Set();
  for (const protocol of ["https:", "http:"]) {
    for (const host of hosts) {
      variants.add(`${protocol}//${host}${basePath}`);
    }
  }

  return {
    canonicalUrl,
    variants: Array.from(variants),
    bareHost,
    unicodeHost,
  };
}
|
||||||
|
|
||||||
|
/**
 * Heuristically decide whether a downloaded resource is HTML, using (in
 * order) the Content-Type header, the file extension, and a sniff of the
 * first 512 bytes.
 * @param {string} filePath - Local path of the file.
 * @param {string|null} contentType - HTTP Content-Type header, if known.
 * @param {Buffer|string|null} firstBytes - Leading bytes of the body, if available.
 * @returns {boolean}
 */
function isHtmlFile(filePath, contentType, firstBytes) {
  // 1) A declared text/html MIME type wins outright.
  if (contentType && /text\/html/i.test(String(contentType))) {
    return true;
  }
  // 2) Well-known HTML-producing extensions.
  const htmlExtensions = new Set([".html", ".htm", ".php", ".asp", ".aspx"]);
  if (htmlExtensions.has(path.extname(filePath).toLowerCase())) {
    return true;
  }
  // 3) Sniff the first 512 bytes for an HTML marker.
  const head = (firstBytes || "").toString("utf8", 0, 512);
  return /<!doctype html/i.test(head) || /<html[\s>]/i.test(head);
}
|
||||||
|
|
||||||
|
/**
 * Heuristically decide whether a resource is a CSS stylesheet, checking the
 * local file extension, the Content-Type header, and the URL path.
 * @param {string} filePath - Local path of the file (may be empty).
 * @param {string|null} resourceUrl - Source URL of the resource, if known.
 * @param {string|null} contentType - HTTP Content-Type header, if known.
 * @returns {boolean}
 */
function isCssResource(filePath, resourceUrl, contentType) {
  // 1) The local file extension is the cheapest signal.
  if (path.extname(filePath || "").toLowerCase() === ".css") {
    return true;
  }
  // 2) A declared text/css MIME type.
  if (contentType && /text\/css/i.test(String(contentType))) {
    return true;
  }
  // 3) Fall back to the URL's path component (query string excluded).
  if (!resourceUrl) {
    return false;
  }
  try {
    const parsed = new URL(resourceUrl);
    return /\.css(?:$|\?)/i.test(parsed.pathname);
  } catch {
    // Unparseable URL: treat as not-CSS.
    return false;
  }
}
|
||||||
|
|
||||||
|
export {
|
||||||
|
renderProgress,
|
||||||
|
toPosix,
|
||||||
|
relativeLink,
|
||||||
|
ensureLocalTargetForPath,
|
||||||
|
normalizeBaseUrlInput,
|
||||||
|
isHtmlFile,
|
||||||
|
isCssResource,
|
||||||
|
};
|
||||||
@@ -1,14 +1,22 @@
|
|||||||
{
|
{
|
||||||
"name": "wayback-downloader",
|
"name": "wayback-machine-downloader",
|
||||||
"version": "0.2.1",
|
"version": "0.2.1",
|
||||||
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
|
"description": "Interactive Wayback Machine downloader for archiving websites locally.",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"main": "downloader.js",
|
"main": "./index.js",
|
||||||
|
"exports": {
|
||||||
|
".": "./index.js",
|
||||||
|
"./downloader": "./lib/downloader.js",
|
||||||
|
"./downloader.js": "./lib/downloader.js",
|
||||||
|
"./cli": "./cli.js",
|
||||||
|
"./package.json": "./package.json"
|
||||||
|
},
|
||||||
"bin": {
|
"bin": {
|
||||||
"wayback-downloader": "downloader.js"
|
"wayback-machine-downloader": "./cli.js"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node downloader.js"
|
"start": "node cli.js",
|
||||||
|
"download": "node cli.js"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"cheerio": "^1.0.0-rc.12",
|
"cheerio": "^1.0.0-rc.12",
|
||||||
@@ -17,19 +25,25 @@
|
|||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=18"
|
"node": ">=18"
|
||||||
},
|
},
|
||||||
|
"files": [
|
||||||
|
"cli.js",
|
||||||
|
"index.js",
|
||||||
|
"lib"
|
||||||
|
],
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"wayback-machine-downloader",
|
"wayback",
|
||||||
"web-archive-downloder",
|
"archive",
|
||||||
"archiver"
|
"downloader",
|
||||||
|
"wayback-machine"
|
||||||
],
|
],
|
||||||
"author": "birbwatcher",
|
"author": "birbwatcher",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"repository": {
|
"repository": {
|
||||||
"type": "git",
|
"type": "git",
|
||||||
"url": "https://github.com/birbwatcher/wayback-downloader.git"
|
"url": "https://github.com/birbwatcher/wayback-machine-downloader.git"
|
||||||
},
|
},
|
||||||
"bugs": {
|
"bugs": {
|
||||||
"url": "https://github.com/birbwatcher/wayback-downloader/issues"
|
"url": "https://github.com/birbwatcher/wayback-machine-downloader/issues"
|
||||||
},
|
},
|
||||||
"homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
|
"homepage": "https://github.com/birbwatcher/wayback-machine-downloader#readme"
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user