From 4629ccd7247b8c29bdf0e59bfc11bc93949ef429 Mon Sep 17 00:00:00 2001 From: User Date: Sun, 26 Oct 2025 10:11:56 +0100 Subject: [PATCH] feat: splitted into files and made some fixes --- README.md | 2 +- wayback-machine-downloader/cli.js | 88 ++++ wayback-machine-downloader/dockerfile | 5 +- wayback-machine-downloader/downloader.js | 493 ------------------ wayback-machine-downloader/index.js | 39 ++ .../lib/asset-manager.js | 392 ++++++++++++++ wayback-machine-downloader/lib/downloader.js | 222 ++++++++ wayback-machine-downloader/lib/logger.js | 21 + .../lib/snapshot-index.js | 138 +++++ wayback-machine-downloader/lib/utils.js | 117 +++++ wayback-machine-downloader/package.json | 34 +- 11 files changed, 1043 insertions(+), 508 deletions(-) create mode 100644 wayback-machine-downloader/cli.js delete mode 100644 wayback-machine-downloader/downloader.js create mode 100644 wayback-machine-downloader/index.js create mode 100644 wayback-machine-downloader/lib/asset-manager.js create mode 100644 wayback-machine-downloader/lib/downloader.js create mode 100644 wayback-machine-downloader/lib/logger.js create mode 100644 wayback-machine-downloader/lib/snapshot-index.js create mode 100644 wayback-machine-downloader/lib/utils.js diff --git a/README.md b/README.md index 8f88f12..568dfb1 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ Got ideas or suggestions? Feel free to open an issue! ## Run ```bash -node downloader.js +node index.js ``` After launching, an interactive menu will appear with the following questions: diff --git a/wayback-machine-downloader/cli.js b/wayback-machine-downloader/cli.js new file mode 100644 index 0000000..65a78aa --- /dev/null +++ b/wayback-machine-downloader/cli.js @@ -0,0 +1,88 @@ +#!/usr/bin/env node + +import path from "path"; +import readline from "readline"; + +import { WaybackMachineDownloader } from "./lib/downloader.js"; +import { normalizeBaseUrlInput } from "./lib/utils.js"; + +function ask(rl, question) { + return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim()))); +} + +async function interactiveMain() { + const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); + + let normalizedBase; + while (true) { + const baseInput = await ask(rl, "Enter domain or URL to archive (e.g., example.com): "); + if (!baseInput) continue; + try { + normalizedBase = normalizeBaseUrlInput(baseInput); + break; + } catch { + console.log("Please enter a valid domain or URL.\n"); + } + } + + const base_url = normalizedBase.canonicalUrl; + + const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: "); + const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: "); + + let rewrite_mode = "as-is"; + const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): "); + if (/^y(es)?$/i.test(m)) rewrite_mode = "relative"; + + let canonical_action = "keep"; + if (rewrite_mode === "relative") { + const c = await ask(rl, 'Canonical: "keep" (default) or "remove": '); + if ((c || "").toLowerCase() === "remove") canonical_action = "remove"; + } + + let threads_count = await ask(rl, "How many download threads? (default 3): "); + threads_count = parseInt(threads_count || "3", 10); + if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3; + + const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? 
(yes/no, default no): ")); + const directory = await ask(rl, "Target directory (leave blank for default websites//): "); + + const ext = await ask(rl, "Download external assets? (yes/no, default no): "); + const download_external_assets = /^y(es)?$/i.test(ext); + + rl.close(); + + const dl = new WaybackMachineDownloader({ + base_url, + normalized_base: normalizedBase, + exact_url, + directory: directory || null, + from_timestamp: from_timestamp || 0, + to_timestamp: to_timestamp || 0, + threads_count, + rewrite_mode, + canonical_action, + download_external_assets, + }); + + await dl.download_files(); +} + +const isDirectCliRun = (() => { + const entryArg = process.argv && process.argv.length > 1 ? process.argv[1] : null; + if (!entryArg) return false; + try { + return import.meta.url === `file://${path.resolve(entryArg)}`; + } catch { + return false; + } +})(); + +if (isDirectCliRun) { + interactiveMain().catch((err) => { + console.error(`FATAL: ${err?.stack || err}`); + process.exit(1); + }); +} + +export { interactiveMain }; \ No newline at end of file diff --git a/wayback-machine-downloader/dockerfile b/wayback-machine-downloader/dockerfile index 3a681f6..05d1b91 100644 --- a/wayback-machine-downloader/dockerfile +++ b/wayback-machine-downloader/dockerfile @@ -7,7 +7,4 @@ RUN npm install --production COPY . . -CMD ["node", "downloader.js"] - -ENTRYPOINT ["node", "downloader.js"] - +ENTRYPOINT ["node", "index.js"] diff --git a/wayback-machine-downloader/downloader.js b/wayback-machine-downloader/downloader.js deleted file mode 100644 index ca81597..0000000 --- a/wayback-machine-downloader/downloader.js +++ /dev/null @@ -1,493 +0,0 @@ -/* - * Wayback Machine Downloader 0.2.1 by WhitelightSEO — Interactive (Node.js, ESM) - * Run: node downloader.js - */ - -import fs from "fs"; -import path from "path"; -import { fileURLToPath, pathToFileURL, domainToUnicode } from "url"; -import { mkdir } from "fs/promises"; -import pLimit from "p-limit"; -import { load } from "cheerio"; -import { Readable } from "stream"; -import readline from "readline"; - -const __filename = fileURLToPath(import.meta.url); -const __dirname = path.dirname(__filename); - -// ----------------------------- PROGRESS BAR ----------------------------- -function renderProgress(current, total) { - const width = 40; - const ratio = total > 0 ? current / total : 0; - const filled = Math.round(ratio * width); - const bar = "█".repeat(filled) + "-".repeat(width - filled); - process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`); - if (current === total) process.stdout.write("\n"); -} - -// ----------------------------- HELPERS ----------------------------- -function toPosix(p) { - return p.split(path.sep).join("/"); -} -function relativeLink(fromDir, toFile) { - const rel = path.relative(fromDir, toFile); - return toPosix(rel || path.basename(toFile)); -} -function ensureLocalTargetForPath(pathname) { - return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".") - ? 
path.posix.join(pathname, "index.html") - : pathname; -} - -// ----------------------------- HTML CHECK ----------------------------- -function isHtmlFile(filePath, contentType, firstBytes) { - if (contentType && /text\/html/i.test(String(contentType))) return true; - const ext = path.extname(filePath).toLowerCase(); - if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true; - const head = (firstBytes || "").toString("utf8", 0, 512); - return /]/i.test(head); -} - - -// ----------------------------- Archive API ----------------------------- -async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) { - const cdx = new URL("https://web.archive.org/cdx/search/xd"); - const params = new URLSearchParams(); - params.set("output", "json"); - params.set("url", baseUrl); - params.set("fl", "timestamp,original"); - params.set("collapse", "digest"); - params.set("gzip", "false"); - if (!all) params.append("filter", "statuscode:200"); - if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp)); - if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp)); - if (pageIndex != null) params.set("page", String(pageIndex)); - cdx.search = params.toString(); - - try { - const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" }); - const text = await res.text(); - let json = []; - try { - json = JSON.parse(text); - } catch { - // silent: treat as empty page - return []; - } - if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") { - json.shift(); - } - return json || []; - } catch { - // silent: skip broken page - return []; - } -} - - -// ----------------------------- DOWNLOADER CLASS ----------------------------- -class WaybackMachineDownloader { - constructor(params) { - this.base_url = params.base_url; - this.exact_url = !!params.exact_url; - this.directory = params.directory || null; - this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0; - this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0; - this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3; - - this.download_external_assets = params.download_external_assets || false; - - this.rewrite_mode = params.rewrite_mode || "as-is"; - this.rewrite_links = this.rewrite_mode === "relative"; - this.canonical_action = params.canonical_action || "keep"; - - this._processed = 0; - } - - // Create a human-readable backup folder name, preserving IDNs - backup_name() { - try { - if (this.base_url.includes("//")) { - const u = new URL(this.base_url); - return domainToUnicode(u.host); - } - } catch {} - return this.base_url; - } - - // Resolve output directory - backup_path() { - if (this.directory) { - return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep; - } - return path.join("websites", this.backup_name(), path.sep); - } - - // Fetch and merge snapshot lists - async get_all_snapshots_to_consider() { - console.log("Getting snapshot pages"); - const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp }; - let list = []; - - list = list.concat(await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts })); - process.stdout.write("."); - - if (!this.exact_url) { - const wildcard = this.base_url.endsWith("/*") ? 
this.base_url : this.base_url.replace(/\/*$/, "") + "/*"; - for (let i = 0; i < 100; i++) { - const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts }); - if (!batch || batch.length === 0) break; - list = list.concat(batch); - process.stdout.write("."); - } - } - console.log(` found ${list.length} snapshots to consider.\n`); - return list; - } - - - // Choose the latest timestamp per unique pathname - async get_file_list_by_timestamp() { - const curated = new Map(); - const all = await this.get_all_snapshots_to_consider(); - for (const pair of all) { - const ts = pair && pair[0]; - const url = pair && pair[1]; - if (!ts || !url) continue; - try { - const u = new URL(url); - const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths - const prev = curated.get(file_id); - if (!prev || prev.timestamp <= ts) { - curated.set(file_id, { file_url: url, timestamp: ts, file_id }); - } - } catch {} - } - const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id })); - arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp))); - return arr; - } - - // Replace Windows-hostile characters when running on Windows - _windowsSanitize(p) { - if (process.platform !== "win32") return p; - return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16)); - } - - // Ensure directory exists - async _structure_dir_path(dir_path) { - try { - await mkdir(dir_path, { recursive: true }); - } catch (e) { - if (!e || e.code !== "EEXIST") throw e; - } - } - - // Compute local file paths for a given archived URL - _determine_paths(file_url, file_id) { - if (!file_url || !file_id) return null; - if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) return null; - if (file_id.length > 200) return null; - - const backup = this.backup_path(); - const parts = file_id.split("/").filter(Boolean); - let dir_path, file_path; - - if (file_id === "") { - dir_path = backup; - file_path = path.join(backup, "index.html"); - } else { - const lastPart = parts[parts.length - 1] || ""; - if (file_url.endsWith("/") || !lastPart.includes(".")) { - dir_path = path.join(backup, ...parts); - file_path = path.join(dir_path, "index.html"); - } else { - dir_path = path.join(backup, ...parts.slice(0, -1)); - file_path = path.join(backup, ...parts); - } - } - - dir_path = this._windowsSanitize(dir_path); - file_path = this._windowsSanitize(file_path); - - return { dir_path, file_path }; - } - - - // Download a single asset (img/css/js/etc.) 
referenced from an HTML page - async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) { - try { - if (fs.existsSync(file_path)) return file_path; - - await this._structure_dir_path(dir_path); - const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`; - let res; - try { - res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" }); - } catch (e) { - console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`); - return null; - } - if (!res.ok || !res.body) { - console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`); - return null; - } - - await new Promise((resolve, reject) => { - const ws = fs.createWriteStream(file_path); - Readable.fromWeb(res.body).pipe(ws); - ws.on("finish", resolve); - ws.on("error", reject); - }); - - return file_path; - } catch (e) { - console.log(`Asset download failed: ${assetUrl} → ${e}`); - return null; - } - } - - // Parse saved HTML, optionally rewrite internal links to relative and fetch assets - async _process_html_assets(htmlPath, pageUrl, pageTimestamp) { - try { - const backupRoot = this.backup_path(); - let html = fs.readFileSync(htmlPath, "utf8"); - const $ = load(html, { decodeEntities: false }); // keep emojis & non-ASCII as-is - const site = new URL(this.base_url); - const siteHost = domainToUnicode(site.hostname.replace(/^www\\./, "")); - const baseDir = path.dirname(htmlPath); - - const downloadTasks = []; - - // ----------- ASSETS ----------- - $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => { - const attr = el.tagName === "link" ? "href" : "src"; - const val = $(el).attr(attr); - if (!val) return; - - try { - const abs = new URL(val, pageUrl).toString(); - const u = new URL(abs); - const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost; - - if (isInternal || this.download_external_assets) { - const file_id = decodeURIComponent(u.pathname); - let paths; - try { - paths = this._determine_paths(abs, file_id); - } catch (e) { - console.log(`Invalid path for asset ${abs}: ${e}`); - return; - } - if (!paths) return; - const { dir_path, file_path } = paths; - - if (this.rewrite_links) { - const normPath = decodeURIComponent(u.pathname) + (u.hash || ""); - const localTarget = ensureLocalTargetForPath(normPath); - const localAbsPath = path.join(backupRoot, localTarget); - $(el).attr(attr, relativeLink(baseDir, localAbsPath)); - } - - if (!fs.existsSync(file_path)) { - downloadTasks.push(this._download_asset(abs, pageTimestamp, file_path, dir_path)); - } - } - } catch {} - }); - - // ----------- INTERNAL LINKS (pages/forms) ----------- - if (this.rewrite_links) { - $("a[href], form[action]").each((_, el) => { - const attr = el.tagName === "a" ? 
"href" : "action"; - const val = $(el).attr(attr); - if (!val) return; - - try { - const abs = new URL(val, pageUrl).toString(); - const u = new URL(abs); - const isInternal = domainToUnicode(u.hostname.replace(/^www\\./, "")) === siteHost; - - if (isInternal) { - const normPath = decodeURIComponent(u.pathname) + (u.hash || ""); - const localTarget = ensureLocalTargetForPath(normPath); - const localAbsPath = path.join(backupRoot, localTarget); - $(el).attr(attr, relativeLink(baseDir, localAbsPath)); - } - } catch {} - }); - } - - await Promise.all(downloadTasks); - - if (this.canonical_action === "remove") { - $("link[rel=\"canonical\"]").remove(); - } - - fs.writeFileSync(htmlPath, $.html(), "utf8"); - } catch (e) { - console.log(`HTML processing error: ${e}`); - } - } - - - // Download one file from the snapshot list (page or asset saved by CDX) - async _download_single(file_remote_info, total) { - const file_url = String(file_remote_info.file_url); - const file_id = file_remote_info.file_id; - const file_timestamp = file_remote_info.timestamp; - - let paths; - try { - paths = this._determine_paths(file_url, file_id); - } catch (e) { - console.log(`Invalid path for ${file_url}: ${e}`); - this._processed++; - renderProgress(this._processed, total); - return; - } - - if (!paths) { - console.log(`Skipping invalid URL: ${file_url}`); - this._processed++; - renderProgress(this._processed, total); - return; - } - - const { dir_path, file_path } = paths; - - if (fs.existsSync(file_path)) { - this._processed++; - renderProgress(this._processed, total); - return; - } - - try { - await this._structure_dir_path(dir_path); - const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`; - let res; - try { - res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" }); - } catch (e) { - console.log(`Skipping ${file_url}, fetch failed: ${e}`); - return; - } - - if (!res.ok || !res.body) { - console.log(`Skipping ${file_url}, bad response ${res.status}`); - return; - } - - await new Promise((resolve, reject) => { - const ws = fs.createWriteStream(file_path); - Readable.fromWeb(res.body).pipe(ws); - ws.on("finish", resolve); - ws.on("error", reject); - }); - - const contentType = res.headers.get("content-type"); - const ext = path.extname(file_path).toLowerCase(); - const looksHtml = isHtmlFile(file_path, contentType, null) || ext === "" || ext === ".html" || ext === ".htm"; - if (looksHtml) { - await this._process_html_assets(file_path, file_url, file_timestamp); - } - } catch (e) { - console.log(`Download failed for ${file_url}: ${e}`); - } finally { - this._processed++; - renderProgress(this._processed, total); - } - } - - // Orchestrate downloads with concurrency - async download_files() { - const startTime = Date.now(); - console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`); - const list = await this.get_file_list_by_timestamp(); - if (list.length === 0) { - console.log("No files to download."); - return; - } - - const concurrency = this.threads_count && this.threads_count > 0 ? 
this.threads_count : 1; - const limit = pLimit(concurrency); - this._processed = 0; - await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length)))); - const endTime = Date.now(); - console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`); - } -} - - -// ============================= INTERACTIVE RUN ============================= -function ask(rl, question) { - return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim()))); -} - -async function interactiveMain() { - const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); - - let base_url; - while (true) { - base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): "); - if (!base_url) continue; - try { - new URL(base_url); - break; - } catch { - console.log("Please enter a valid URL.\n"); - } - } - - const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: "); - const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: "); - - let rewrite_mode = "as-is"; - const m = await ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): "); - if (/^y(es)?$/i.test(m)) rewrite_mode = "relative"; - - let canonical_action = "keep"; - if (rewrite_mode === "relative") { - const c = await ask(rl, 'Canonical: "keep" (default) or "remove": '); - if ((c || '').toLowerCase() === "remove") canonical_action = "remove"; - } - - let threads_count = await ask(rl, "How many download threads? (default 3): "); - threads_count = parseInt(threads_count || "3", 10); - if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3; - - const exact_url = /^y(es)?$/i.test(await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ")); - const directory = await ask(rl, "Target directory (leave blank for default websites//): "); - - const ext = await ask(rl, "Download external assets? (yes/no, default no): "); - const download_external_assets = /^y(es)?$/i.test(ext); - - rl.close(); - - const dl = new WaybackMachineDownloader({ - base_url, - exact_url, - directory: directory || null, - from_timestamp: from_timestamp || 0, - to_timestamp: to_timestamp || 0, - threads_count, - rewrite_mode, - canonical_action, - download_external_assets, - }); - - await dl.download_files(); -} - -const isDirectRun = - import.meta.url === `file://${process.argv[1]}` || - import.meta.url === pathToFileURL(process.argv[1]).href; - -if (isDirectRun) { - interactiveMain().catch((err) => { - console.error(`FATAL: ${err?.stack || err}`); - process.exit(1); - }); -} - -export { WaybackMachineDownloader }; diff --git a/wayback-machine-downloader/index.js b/wayback-machine-downloader/index.js new file mode 100644 index 0000000..db65769 --- /dev/null +++ b/wayback-machine-downloader/index.js @@ -0,0 +1,39 @@ +/* + * Wayback Machine Downloader 0.3.0 by WhitelightSEO + * Run: node index.js + */ + +import { pathToFileURL } from "url"; + +import { setDebugMode, getDebugMode, debugLog } from "./lib/logger.js"; +import { WaybackMachineDownloader } from "./lib/downloader.js"; + +const DEBUG_MODE = false; +setDebugMode(DEBUG_MODE); + +const isDirectRun = (() => { + const entryArg = process.argv && process.argv.length > 1 ? 
process.argv[1] : null; + if (!entryArg) return false; + + if (import.meta.url === `file://${entryArg}`) { + return true; + } + + try { + return import.meta.url === pathToFileURL(entryArg).href; + } catch (e) { + debugLog(`Failed to resolve entry script URL: ${e}`); + return false; + } +})(); + +if (isDirectRun) { + import("./cli.js") + .then(({ interactiveMain }) => interactiveMain()) + .catch((err) => { + console.error(`FATAL: ${err?.stack || err}`); + process.exit(1); + }); +} + +export { WaybackMachineDownloader, DEBUG_MODE, setDebugMode, getDebugMode }; \ No newline at end of file diff --git a/wayback-machine-downloader/lib/asset-manager.js b/wayback-machine-downloader/lib/asset-manager.js new file mode 100644 index 0000000..7d931ee --- /dev/null +++ b/wayback-machine-downloader/lib/asset-manager.js @@ -0,0 +1,392 @@ +import fs from "fs"; +import path from "path"; +import { mkdir } from "fs/promises"; +import { load } from "cheerio"; +import { Readable } from "stream"; +import { domainToUnicode } from "url"; + +import { debugLog } from "./logger.js"; +import { + relativeLink, + ensureLocalTargetForPath, + isCssResource, +} from "./utils.js"; + +class AssetManager { + constructor({ + backupPathResolver, + rewriteLinks, + canonicalAction, + downloadExternalAssets, + baseHostUnicode, + snapshotIndex, + }) { + this.backupPathResolver = backupPathResolver; + this.rewriteLinks = !!rewriteLinks; + this.canonicalAction = canonicalAction || "keep"; + this.downloadExternalAssets = !!downloadExternalAssets; + this.baseHostUnicode = (baseHostUnicode || "").toLowerCase(); + this.snapshotIndex = snapshotIndex || null; + } + + setSnapshotIndex(index) { + this.snapshotIndex = index; + } + + get backupPath() { + const resolver = this.backupPathResolver; + return typeof resolver === "function" ? 
resolver() : resolver; + } + + windowsSanitize(p) { + if (process.platform !== "win32") return p; + return p.replace(/[:*?&=<>\\|]/g, (s) => "%" + s.charCodeAt(0).toString(16)); + } + + async ensureDir(dirPath) { + try { + await mkdir(dirPath, { recursive: true }); + } catch (e) { + if (!e || e.code !== "EEXIST") throw e; + } + } + + determinePaths(fileUrl, fileId) { + if (!fileUrl || !fileId) return null; + if (fileUrl.startsWith("data:") || fileUrl.startsWith("javascript:")) return null; + if (fileId.length > 200) return null; + + const backup = this.backupPath; + const parts = fileId.split("/").filter(Boolean); + let dirPath; + let filePath; + + if (fileId === "") { + dirPath = backup; + filePath = path.join(backup, "index.html"); + } else { + const lastPart = parts[parts.length - 1] || ""; + if (fileUrl.endsWith("/") || !lastPart.includes(".")) { + dirPath = path.join(backup, ...parts); + filePath = path.join(dirPath, "index.html"); + } else { + dirPath = path.join(backup, ...parts.slice(0, -1)); + filePath = path.join(backup, ...parts); + } + } + + dirPath = this.windowsSanitize(dirPath); + filePath = this.windowsSanitize(filePath); + + return { dirPath, filePath }; + } + + resolveAssetTimestamp(assetUrl, fallbackTimestamp) { + if (!this.snapshotIndex) return fallbackTimestamp || 0; + return this.snapshotIndex.resolve(assetUrl, fallbackTimestamp); + } + + async downloadAsset(assetUrl, pageTimestamp, filePath, dirPath) { + try { + if (fs.existsSync(filePath)) return filePath; + + await this.ensureDir(dirPath); + const assetTimestamp = this.resolveAssetTimestamp(assetUrl, pageTimestamp); + if (!assetTimestamp) { + debugLog(`Skipping asset ${assetUrl}, no timestamp available in range.`); + return null; + } + const snapshotUrl = `https://web.archive.org/web/${assetTimestamp}id_/${assetUrl}`; + let res; + try { + res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" }); + } catch (e) { + debugLog(`Skipping asset ${assetUrl}, fetch failed: ${e}`); + return null; + } + if (!res.ok || !res.body) { + debugLog(`Skipping asset ${assetUrl}, bad response ${res.status}`); + return null; + } + + const contentType = res.headers.get("content-type") || ""; + + await new Promise((resolve, reject) => { + const ws = fs.createWriteStream(filePath); + Readable.fromWeb(res.body).pipe(ws); + ws.on("finish", resolve); + ws.on("error", reject); + }); + + if (this.rewriteLinks && isCssResource(filePath, assetUrl, contentType)) { + await this.rewriteCssFile(filePath, assetUrl, assetTimestamp); + } + + return filePath; + } catch (e) { + debugLog(`Asset download failed: ${assetUrl} → ${e}`); + return null; + } + } + + async rewriteCssContent(cssContent, cssSourceUrl, pageTimestamp, { baseDir, excludePath } = {}) { + if (!this.rewriteLinks) { + return { css: cssContent, downloads: [] }; + } + + if (!cssContent || !cssContent.trim()) { + return { css: cssContent, downloads: [] }; + } + + const siteHost = this.baseHostUnicode; + const downloads = []; + const seenPaths = new Set(); + let updatedContent = cssContent; + let cssChanged = false; + + const processReference = (rawValue) => { + if (!rawValue) return null; + const trimmed = rawValue.trim(); + if (!trimmed) return null; + if (/^(data:|javascript:|#)/i.test(trimmed)) return null; + + let absoluteUrl; + try { + absoluteUrl = new URL(trimmed, cssSourceUrl).toString(); + } catch { + return null; + } + + let parsed; + try { + parsed = new URL(absoluteUrl); + } catch { + return null; + } + if (!/^https?:$/i.test(parsed.protocol)) return null; + + const 
normalizedHost = domainToUnicode(parsed.hostname.replace(/^www\./, "")).toLowerCase(); + const isInternal = normalizedHost === siteHost; + if (!isInternal && !this.downloadExternalAssets) return null; + + let fileId; + try { + fileId = decodeURIComponent(parsed.pathname); + } catch { + fileId = parsed.pathname; + } + let paths; + try { + paths = this.determinePaths(absoluteUrl, fileId); + } catch { + return null; + } + if (!paths) return null; + + const { dirPath, filePath } = paths; + const assetTimestamp = this.resolveAssetTimestamp(absoluteUrl, pageTimestamp); + + if ( + filePath && + (!excludePath || path.resolve(filePath) !== path.resolve(excludePath)) + ) { + const key = path.resolve(filePath); + if (!fs.existsSync(filePath) && !seenPaths.has(key)) { + seenPaths.add(key); + downloads.push(this.downloadAsset(absoluteUrl, assetTimestamp, filePath, dirPath)); + } + } + + const relativeBase = baseDir || path.dirname(filePath); + const relativePath = relativeLink(relativeBase, filePath) + (parsed.hash || ""); + + return { + original: trimmed, + replacement: relativePath, + }; + }; + + const urlPattern = /url\(\s*(['"]?)([^'")]+)\1\s*\)/gi; + updatedContent = updatedContent.replace(urlPattern, (match, quote, value) => { + const info = processReference(value); + if (!info) return match; + if (info.replacement === info.original) return match; + cssChanged = true; + const q = quote || ""; + return `url(${q}${info.replacement}${q})`; + }); + + const importPattern = /@import\s+(?!url\()\s*(['"])([^'"]+)\1/gi; + updatedContent = updatedContent.replace(importPattern, (match, quote, value) => { + const info = processReference(value); + if (!info) return match; + if (info.replacement === info.original) return match; + cssChanged = true; + return match.replace(value, info.replacement); + }); + + return { + css: cssChanged && updatedContent !== cssContent ? updatedContent : cssContent, + downloads, + }; + } + + async rewriteCssFile(cssPath, cssSourceUrl, pageTimestamp) { + if (!this.rewriteLinks) return; + + let cssContent; + try { + cssContent = fs.readFileSync(cssPath, "utf8"); + } catch { + return; + } + + const cssDir = path.dirname(cssPath); + const { css: updatedContent, downloads } = await this.rewriteCssContent( + cssContent, + cssSourceUrl, + pageTimestamp, + { + baseDir: cssDir, + excludePath: cssPath, + } + ); + + if (downloads.length > 0) { + await Promise.all(downloads); + } + + if (updatedContent !== cssContent) { + fs.writeFileSync(cssPath, updatedContent, "utf8"); + } + } + + async processHtml(htmlPath, pageUrl, pageTimestamp) { + try { + let html = fs.readFileSync(htmlPath, "utf8"); + const $ = load(html, { decodeEntities: false }); + const siteHost = this.baseHostUnicode; + const baseDir = path.dirname(htmlPath); + const backupRoot = this.backupPath; + + const downloadTasks = []; + + const handleCssFragment = async (cssText) => { + const { css: updatedCss, downloads } = await this.rewriteCssContent( + cssText, + pageUrl, + pageTimestamp, + { baseDir } + ); + if (downloads.length > 0) { + downloadTasks.push(...downloads); + } + return updatedCss; + }; + + $("img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]").each((_, el) => { + const attr = el.tagName === "link" ? 
"href" : "src"; + const val = $(el).attr(attr); + if (!val) return; + + try { + const abs = new URL(val, pageUrl).toString(); + const u = new URL(abs); + const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase(); + const isInternal = normalizedHost === siteHost; + + if (isInternal || this.downloadExternalAssets) { + let fileId; + try { + fileId = decodeURIComponent(u.pathname); + } catch { + fileId = u.pathname; + } + let paths; + try { + paths = this.determinePaths(abs, fileId); + } catch (e) { + console.log(`Invalid path for asset ${abs}: ${e}`); + return; + } + if (!paths) return; + const { dirPath, filePath } = paths; + + if (this.rewriteLinks) { + const normPath = fileId + (u.hash || ""); + const localTarget = ensureLocalTargetForPath(normPath); + const localAbsPath = path.join(backupRoot, localTarget); + $(el).attr(attr, relativeLink(baseDir, localAbsPath)); + } + + if (!fs.existsSync(filePath)) { + downloadTasks.push( + this.downloadAsset(abs, pageTimestamp, filePath, dirPath) + ); + } + } + } catch {} + }); + + const styleNodes = $("style").toArray(); + for (const node of styleNodes) { + const cssText = $(node).html(); + if (!cssText) continue; + const updated = await handleCssFragment(cssText); + if (updated !== cssText) { + $(node).text(updated); + } + } + + const inlineStyled = $("[style]").toArray(); + for (const node of inlineStyled) { + const styleAttr = $(node).attr("style"); + if (!styleAttr) continue; + const updated = await handleCssFragment(styleAttr); + if (updated !== styleAttr) { + $(node).attr("style", updated); + } + } + + if (this.rewriteLinks) { + $("a[href], form[action]").each((_, el) => { + const attr = el.tagName === "a" ? "href" : "action"; + const val = $(el).attr(attr); + if (!val) return; + + try { + const abs = new URL(val, pageUrl).toString(); + const u = new URL(abs); + const normalizedHost = domainToUnicode(u.hostname.replace(/^www\./, "")).toLowerCase(); + const isInternal = normalizedHost === siteHost; + + if (isInternal) { + let normPath; + try { + normPath = decodeURIComponent(u.pathname); + } catch { + normPath = u.pathname; + } + normPath += u.hash || ""; + const localTarget = ensureLocalTargetForPath(normPath); + const localAbsPath = path.join(backupRoot, localTarget); + $(el).attr(attr, relativeLink(baseDir, localAbsPath)); + } + } catch {} + }); + } + + await Promise.all(downloadTasks); + + if (this.canonicalAction === "remove") { + $("link[rel=\"canonical\"]").remove(); + } + + fs.writeFileSync(htmlPath, $.html(), "utf8"); + } catch (e) { + console.log(`HTML processing error: ${e}`); + } + } +} + +export { AssetManager }; \ No newline at end of file diff --git a/wayback-machine-downloader/lib/downloader.js b/wayback-machine-downloader/lib/downloader.js new file mode 100644 index 0000000..c6d2ce6 --- /dev/null +++ b/wayback-machine-downloader/lib/downloader.js @@ -0,0 +1,222 @@ +import fs from "fs"; +import path from "path"; +import { domainToUnicode } from "url"; +import pLimit from "p-limit"; +import { Readable } from "stream"; + +import { debugLog } from "./logger.js"; +import { renderProgress, normalizeBaseUrlInput, isHtmlFile, isCssResource } from "./utils.js"; +import { SnapshotIndex } from "./snapshot-index.js"; +import { AssetManager } from "./asset-manager.js"; + +async function getRawListFromApi({ baseUrl, pageIndex, all, fromTimestamp, toTimestamp }) { + const cdx = new URL("https://web.archive.org/cdx/search/xd"); + const params = new URLSearchParams(); + params.set("output", "json"); + 
params.set("url", baseUrl); + params.set("fl", "timestamp,original"); + params.set("collapse", "digest"); + params.set("gzip", "false"); + if (!all) params.append("filter", "statuscode:200"); + if (fromTimestamp && Number(fromTimestamp) !== 0) params.set("from", String(fromTimestamp)); + if (toTimestamp && Number(toTimestamp) !== 0) params.set("to", String(toTimestamp)); + if (pageIndex != null) params.set("page", String(pageIndex)); + cdx.search = params.toString(); + + try { + const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" }); + const text = await res.text(); + let json = []; + try { + json = JSON.parse(text); + } catch { + return []; + } + if (Array.isArray(json) && Array.isArray(json[0]) && json[0].join(",") === "timestamp,original") { + json.shift(); + } + return json || []; + } catch { + return []; + } +} + +class WaybackMachineDownloader { + constructor(params) { + const normalized = params.normalized_base || normalizeBaseUrlInput(params.base_url); + + this.base_url = normalized.canonicalUrl; + this.base_variants = normalized.variants; + this.base_host_unicode = (normalized.unicodeHost || normalized.bareHost).toLowerCase(); + + this.exact_url = !!params.exact_url; + this.directory = params.directory || null; + this.from_timestamp = params.from_timestamp ? Number(params.from_timestamp) : 0; + this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0; + this.threads_count = params.threads_count != null ? Number(params.threads_count) : 3; + + this.download_external_assets = params.download_external_assets || false; + + this.rewrite_mode = params.rewrite_mode || "as-is"; + this.rewrite_links = this.rewrite_mode === "relative"; + this.canonical_action = params.canonical_action || "keep"; + + this._processed = 0; + this.snapshotIndex = null; + + this.assetManager = new AssetManager({ + backupPathResolver: () => this.backup_path(), + rewriteLinks: this.rewrite_links, + canonicalAction: this.canonical_action, + downloadExternalAssets: this.download_external_assets, + baseHostUnicode: this.base_host_unicode, + snapshotIndex: null, + }); + } + + backup_name() { + try { + if (this.base_url.includes("//")) { + const u = new URL(this.base_url); + return domainToUnicode(u.host); + } + } catch {} + return this.base_url; + } + + backup_path() { + if (this.directory) { + return this.directory.endsWith(path.sep) ? this.directory : this.directory + path.sep; + } + return path.join("websites", this.backup_name(), path.sep); + } + + async get_all_snapshots_to_consider() { + console.log("Getting snapshot pages"); + const httpOpts = { all: true, fromTimestamp: this.from_timestamp, toTimestamp: this.to_timestamp }; + let list = []; + const bases = this.base_variants && this.base_variants.length > 0 ? this.base_variants : [this.base_url]; + + for (const base of bases) { + list = list.concat(await getRawListFromApi({ baseUrl: base, pageIndex: null, ...httpOpts })); + process.stdout.write("."); + + if (!this.exact_url) { + const wildcard = base.endsWith("/*") ? 
base : base.replace(/\/*$/, "") + "/*"; + for (let i = 0; i < 100; i++) { + const batch = await getRawListFromApi({ baseUrl: wildcard, pageIndex: i, ...httpOpts }); + if (!batch || batch.length === 0) break; + list = list.concat(batch); + process.stdout.write("."); + } + } + } + console.log(` found ${list.length} snapshots to consider.\n`); + return list; + } + + async get_file_list_by_timestamp() { + const index = new SnapshotIndex(); + const all = await this.get_all_snapshots_to_consider(); + for (const pair of all) { + const ts = pair && pair[0]; + const url = pair && pair[1]; + if (!ts || !url) continue; + index.register(url, ts); + } + + const manifest = index.getManifest(); + this.snapshotIndex = index; + this.assetManager.setSnapshotIndex(index); + return manifest; + } + + async _download_single(file_remote_info, total) { + const file_url = String(file_remote_info.file_url); + const file_id = file_remote_info.file_id; + const file_timestamp = file_remote_info.timestamp; + + let paths; + try { + paths = this.assetManager.determinePaths(file_url, file_id); + } catch (e) { + console.log(`Invalid path for ${file_url}: ${e}`); + this._processed++; + renderProgress(this._processed, total); + return; + } + + if (!paths) { + console.log(`Skipping invalid URL: ${file_url}`); + this._processed++; + renderProgress(this._processed, total); + return; + } + + const { dirPath, filePath } = paths; + + if (fs.existsSync(filePath)) { + this._processed++; + renderProgress(this._processed, total); + return; + } + + try { + await this.assetManager.ensureDir(dirPath); + const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`; + let res; + try { + res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" }); + } catch (e) { + debugLog(`Skipping ${file_url}, fetch failed: ${e}`); + return; + } + + if (!res.ok || !res.body) { + debugLog(`Skipping ${file_url}, bad response ${res.status}`); + return; + } + + await new Promise((resolve, reject) => { + const ws = fs.createWriteStream(filePath); + Readable.fromWeb(res.body).pipe(ws); + ws.on("finish", resolve); + ws.on("error", reject); + }); + + const contentType = res.headers.get("content-type") || ""; + const ext = path.extname(filePath).toLowerCase(); + const looksHtml = isHtmlFile(filePath, contentType, null) || ext === "" || ext === ".html" || ext === ".htm"; + if (this.rewrite_links && isCssResource(filePath, file_url, contentType)) { + await this.assetManager.rewriteCssFile(filePath, file_url, file_timestamp); + } + if (this.rewrite_links && looksHtml) { + await this.assetManager.processHtml(filePath, file_url, file_timestamp); + } + } catch (e) { + debugLog(`Download failed for ${file_url}: ${e}`); + } finally { + this._processed++; + renderProgress(this._processed, total); + } + } + + async download_files() { + const startTime = Date.now(); + console.log(`Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.`); + const list = await this.get_file_list_by_timestamp(); + if (list.length === 0) { + console.log("No files to download."); + return; + } + + const concurrency = this.threads_count && this.threads_count > 0 ? 
this.threads_count : 1; + const limit = pLimit(concurrency); + this._processed = 0; + await Promise.all(list.map((info) => limit(() => this._download_single(info, list.length)))); + const endTime = Date.now(); + console.log(`\nDownload completed in ${((endTime - startTime) / 1000).toFixed(2)}s, saved in ${this.backup_path()} (${list.length} files)`); + } +} + +export { WaybackMachineDownloader }; \ No newline at end of file diff --git a/wayback-machine-downloader/lib/logger.js b/wayback-machine-downloader/lib/logger.js new file mode 100644 index 0000000..31c2869 --- /dev/null +++ b/wayback-machine-downloader/lib/logger.js @@ -0,0 +1,21 @@ +let debugMode = false; + +function setDebugMode(value) { + debugMode = !!value; +} + +function getDebugMode() { + return debugMode; +} + +function debugLog(...args) { + if (debugMode) { + console.log(...args); + } +} + +function infoLog(...args) { + console.log(...args); +} + +export { setDebugMode, getDebugMode, debugLog, infoLog }; \ No newline at end of file diff --git a/wayback-machine-downloader/lib/snapshot-index.js b/wayback-machine-downloader/lib/snapshot-index.js new file mode 100644 index 0000000..c247530 --- /dev/null +++ b/wayback-machine-downloader/lib/snapshot-index.js @@ -0,0 +1,138 @@ +class SnapshotIndex { + constructor() { + this.byPath = new Map(); + this.byPathAndQuery = new Map(); + this.lookupByPath = null; + this.lookupByPathAndQuery = null; + this.manifestCache = null; + } + + register(url, timestamp) { + if (!url || !timestamp) return; + + let parsed; + try { + parsed = new URL(url); + } catch { + return; + } + + let filePath; + try { + filePath = decodeURIComponent(parsed.pathname); + } catch { + filePath = parsed.pathname; + } + const search = parsed.search || ""; + const queryKey = `${filePath}${search}`; + + const normalizedTimestamp = String(timestamp); + + const currentByPath = this.byPath.get(filePath); + if (!currentByPath || String(currentByPath.timestamp) <= normalizedTimestamp) { + this.byPath.set(filePath, { + file_url: url, + timestamp: normalizedTimestamp, + file_id: filePath, + }); + } + + const currentByQuery = this.byPathAndQuery.get(queryKey); + if (!currentByQuery || String(currentByQuery.timestamp) <= normalizedTimestamp) { + this.byPathAndQuery.set(queryKey, { + file_url: url, + timestamp: normalizedTimestamp, + file_id: filePath, + }); + } + + this.lookupByPath = null; + this.lookupByPathAndQuery = null; + this.manifestCache = null; + } + + buildCaches() { + if (this.manifestCache) { + return; + } + + const manifest = Array.from(this.byPath.entries()).map(([file_id, value]) => ({ + ...value, + file_id, + })); + + manifest.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp))); + + const byPath = new Map(); + const byQuery = new Map(); + + for (const entry of manifest) { + const { file_url, file_id, timestamp } = entry; + if (file_id && timestamp && !byPath.has(file_id)) { + byPath.set(file_id, timestamp); + } + if (file_url) { + try { + const u = new URL(file_url); + let decodedPath; + try { + decodedPath = decodeURIComponent(u.pathname); + } catch { + decodedPath = u.pathname; + } + const pathKey = `${decodedPath}${u.search || ""}`; + if (pathKey && timestamp && !byQuery.has(pathKey)) { + byQuery.set(pathKey, timestamp); + } + } catch {} + } + } + + for (const [queryKey, entry] of this.byPathAndQuery.entries()) { + const ts = entry && entry.timestamp; + if (!queryKey || !ts) continue; + if (!byQuery.has(queryKey)) { + byQuery.set(queryKey, ts); + } + const basePath = 
queryKey.replace(/\?.*$/, ""); + if (basePath && !byPath.has(basePath)) { + byPath.set(basePath, ts); + } + } + + this.manifestCache = manifest; + this.lookupByPath = byPath; + this.lookupByPathAndQuery = byQuery; + } + + getManifest() { + this.buildCaches(); + return this.manifestCache || []; + } + + resolve(assetUrl, fallbackTimestamp) { + this.buildCaches(); + let resolved = fallbackTimestamp || 0; + if (!assetUrl) return resolved; + + try { + const u = new URL(assetUrl); + let decodedPath; + try { + decodedPath = decodeURIComponent(u.pathname); + } catch { + decodedPath = u.pathname; + } + const queryKey = `${decodedPath}${u.search || ""}`; + if (this.lookupByPathAndQuery && this.lookupByPathAndQuery.has(queryKey)) { + resolved = this.lookupByPathAndQuery.get(queryKey); + } else if (this.lookupByPath && this.lookupByPath.has(decodedPath)) { + resolved = this.lookupByPath.get(decodedPath); + } + } catch {} + + return resolved; + } +} + +export { SnapshotIndex }; \ No newline at end of file diff --git a/wayback-machine-downloader/lib/utils.js b/wayback-machine-downloader/lib/utils.js new file mode 100644 index 0000000..283b48d --- /dev/null +++ b/wayback-machine-downloader/lib/utils.js @@ -0,0 +1,117 @@ +import path from "path"; +import { domainToUnicode } from "url"; + +function renderProgress(current, total) { + const width = 40; + const ratio = total > 0 ? current / total : 0; + const filled = Math.round(ratio * width); + const bar = "█".repeat(filled) + "-".repeat(width - filled); + process.stdout.write(`\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})`); + if (current === total) process.stdout.write("\n"); +} + +function toPosix(p) { + return p.split(path.sep).join("/"); +} + +function relativeLink(fromDir, toFile) { + const rel = path.relative(fromDir, toFile); + return toPosix(rel || path.basename(toFile)); +} + +function ensureLocalTargetForPath(pathname) { + return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".") + ? path.posix.join(pathname, "index.html") + : pathname; +} + +function normalizeBaseUrlInput(input) { + if (!input || typeof input !== "string") { + throw new Error("Base URL must be a non-empty string"); + } + + let raw = input.trim(); + if (!raw) { + throw new Error("Base URL must not be empty"); + } + + if (!/^[a-zA-Z][a-zA-Z0-9+.-]*:/.test(raw)) { + raw = `https://${raw}`; + } + + let parsed; + try { + parsed = new URL(raw); + } catch (e) { + throw new Error(`Invalid URL: ${e.message}`); + } + + if (!/^https?:$/i.test(parsed.protocol)) { + throw new Error("Only http and https protocols are supported"); + } + + const asciiHost = parsed.hostname.toLowerCase(); + if (!asciiHost) { + throw new Error("URL must contain a hostname"); + } + + const bareHost = asciiHost.replace(/^www\./, ""); + const unicodeHost = domainToUnicode(bareHost); + const port = parsed.port ? `:${parsed.port}` : ""; + const basePath = parsed.pathname && parsed.pathname !== "/" ? 
parsed.pathname.replace(/\/+$/, "") : "";
+
+  const canonicalUrl = `https://${bareHost}${port}${basePath}`;
+
+  const hostSet = new Set([`${bareHost}${port}`]);
+  if (asciiHost !== bareHost) {
+    hostSet.add(`${asciiHost}${port}`);
+  } else if (bareHost && bareHost.includes(".")) {
+    hostSet.add(`www.${bareHost}${port}`);
+  }
+
+  const protocols = ["https:", "http:"];
+  const variants = new Set();
+  for (const protocol of protocols) {
+    for (const host of hostSet) {
+      variants.add(`${protocol}//${host}${basePath}`);
+    }
+  }
+
+  return {
+    canonicalUrl,
+    variants: Array.from(variants),
+    bareHost,
+    unicodeHost,
+  };
+}
+
+function isHtmlFile(filePath, contentType, firstBytes) {
+  if (contentType && /text\/html/i.test(String(contentType))) return true;
+  const ext = path.extname(filePath).toLowerCase();
+  if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true;
+  const head = (firstBytes || "").toString("utf8", 0, 512);
+  return /<html[\s>]/i.test(head);
+}
+
+function isCssResource(filePath, resourceUrl, contentType) {
+  const ext = path.extname(filePath || "").toLowerCase();
+  if (ext === ".css") return true;
+  if (contentType && /text\/css/i.test(String(contentType))) return true;
+  if (resourceUrl) {
+    try {
+      const u = new URL(resourceUrl);
+      if (/\.css(?:$|\?)/i.test(u.pathname)) return true;
+    } catch {}
+  }
+  return false;
+}
+
+export {
+  renderProgress,
+  toPosix,
+  relativeLink,
+  ensureLocalTargetForPath,
+  normalizeBaseUrlInput,
+  isHtmlFile,
+  isCssResource,
+};
\ No newline at end of file
diff --git a/wayback-machine-downloader/package.json b/wayback-machine-downloader/package.json
index 42d5ba7..9b7c15a 100644
--- a/wayback-machine-downloader/package.json
+++ b/wayback-machine-downloader/package.json
@@ -1,14 +1,22 @@
 {
-  "name": "wayback-downloader",
+  "name": "wayback-machine-downloader",
   "version": "0.2.1",
   "description": "Interactive Wayback Machine downloader for archiving websites locally.",
   "type": "module",
-  "main": "downloader.js",
+  "main": "./index.js",
+  "exports": {
+    ".": "./index.js",
+    "./downloader": "./lib/downloader.js",
+    "./downloader.js": "./lib/downloader.js",
+    "./cli": "./cli.js",
+    "./package.json": "./package.json"
+  },
   "bin": {
-    "wayback-downloader": "downloader.js"
+    "wayback-machine-downloader": "./cli.js"
   },
   "scripts": {
-    "start": "node downloader.js"
+    "start": "node cli.js",
+    "download": "node cli.js"
   },
   "dependencies": {
     "cheerio": "^1.0.0-rc.12",
@@ -17,19 +25,25 @@
   "engines": {
     "node": ">=18"
   },
+  "files": [
+    "cli.js",
+    "index.js",
+    "lib"
+  ],
   "keywords": [
-    "wayback-machine-downloader",
-    "web-archive-downloder",
-    "archiver"
+    "wayback",
+    "archive",
+    "downloader",
+    "wayback-machine"
   ],
   "author": "birbwatcher",
   "license": "MIT",
   "repository": {
     "type": "git",
-    "url": "https://github.com/birbwatcher/wayback-downloader.git"
+    "url": "https://github.com/birbwatcher/wayback-machine-downloader.git"
   },
   "bugs": {
-    "url": "https://github.com/birbwatcher/wayback-downloader/issues"
+    "url": "https://github.com/birbwatcher/wayback-machine-downloader/issues"
   },
-  "homepage": "https://github.com/birbwatcher/wayback-downloader#readme"
+  "homepage": "https://github.com/birbwatcher/wayback-machine-downloader#readme"
 }
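
For reference, a minimal sketch of driving the refactored downloader programmatically rather than through the interactive `cli.js` prompts, based on the constructor options and exports introduced in this patch. The file name, domain, timestamps, and option values are illustrative placeholders, and the script assumes it is run from the package root so the relative imports resolve.

```js
// programmatic-run.js: illustrative sketch; run from the package root (ESM, Node >= 18).
import { WaybackMachineDownloader } from "./index.js";
import { normalizeBaseUrlInput } from "./lib/utils.js";

// normalizeBaseUrlInput() accepts a bare domain or a full URL and returns the
// canonical https form plus the http/https and www/non-www variants that are
// queried against the CDX API.
const normalized = normalizeBaseUrlInput("www.example.com/blog/");
// normalized.canonicalUrl === "https://example.com/blog"
// normalized.variants contains "https://example.com/blog", "https://www.example.com/blog",
//                              "http://example.com/blog" and "http://www.example.com/blog"

const dl = new WaybackMachineDownloader({
  base_url: normalized.canonicalUrl,
  normalized_base: normalized,      // optional; the constructor re-normalizes base_url if omitted
  exact_url: false,                 // also query the /* wildcard for sub-pages
  directory: null,                  // default output: websites/<domain>/
  from_timestamp: 0,                // or "YYYYMMDDhhmmss" bounds
  to_timestamp: 0,
  threads_count: 3,
  rewrite_mode: "relative",         // "as-is" keeps original links untouched
  canonical_action: "keep",
  download_external_assets: false,
});

await dl.download_files();
```

With the new `bin` entry, the same interactive flow is also available as `npx wayback-machine-downloader` once the package is installed under that name.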
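Likewise, a small sketch of how the new `SnapshotIndex` chooses timestamps for assets: the latest capture recorded for a path wins, an exact path-plus-query match takes precedence, and unknown URLs fall back to the page's own timestamp. The URLs and timestamps below are made up for illustration.

```js
// snapshot-index-demo.js: illustrative values; run from the package root (ESM).
import { SnapshotIndex } from "./lib/snapshot-index.js";

const index = new SnapshotIndex();

// register() keeps the newest timestamp seen for each path and for each path+query.
index.register("https://example.com/css/site.css", "20200101000000");
index.register("https://example.com/css/site.css", "20210101000000"); // newer capture wins
index.register("https://example.com/css/site.css?v=2", "20190101000000");

// Exact path+query match takes precedence:
console.log(index.resolve("https://example.com/css/site.css?v=2", "20220101000000")); // "20190101000000"

// Path-only match returns the latest capture for that path:
console.log(index.resolve("https://example.com/css/site.css", 0)); // "20210101000000"

// Unknown assets fall back to the page timestamp passed in:
console.log(index.resolve("https://example.com/img/logo.png", "20220101000000")); // "20220101000000"
```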