feat: add support for Cyrillic URLs and paths

User
2025-09-28 15:27:38 +02:00
parent 7541f2ebee
commit f840c4a8f1
4 changed files with 29 additions and 11 deletions
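
Background: the WHATWG URL parser behind new URL() stores hosts in Punycode
and keeps paths percent-encoded, so a Cyrillic site would otherwise be saved
under names like "xn--e1afmkfd.xn--p1ai" and "/%D0%BF...". A minimal sketch of
the two conversions this commit applies (Node.js, ESM; "пример.рф" is a
hypothetical domain used only for illustration):

import { domainToUnicode } from "url";

const u = new URL("https://пример.рф/путь");
console.log(u.hostname);                     // "xn--e1afmkfd.xn--p1ai" (Punycode)
console.log(domainToUnicode(u.hostname));    // "пример.рф" (human-readable)
console.log(u.pathname);                     // "/%D0%BF%D1%83%D1%82%D1%8C" (percent-encoded UTF-8)
console.log(decodeURIComponent(u.pathname)); // "/путь" (decoded form used as a file id)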

.DS_Store (new binary file; contents not shown)

README.md

@@ -12,6 +12,20 @@ This webarchive website downloader has an interactive interface, supports downlo
 ---
+## Table of Contents
+- [Features of Web Archive Website Downloader](#features-of-web-archive-website-downloader)
+- [Special Features](#special-features)
+- [Requirements](#requirements)
+- [Installation](#installation)
+- [Run](#run)
+- [Example](#example)
+- [Common Issues](#common-issues)
+- [(Important) Download responsibly](#important-download-responsibly)
+- [Contributing](#contributing)
+---
 ## Features of Web Archive Website Downloader
 1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets.

downloader.js

@@ -1,11 +1,11 @@
 /*
- * Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM)
+ * Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
  * Run: node downloader.js
  */
 
 import fs from "fs";
 import path from "path";
-import { fileURLToPath, pathToFileURL } from "url";
+import { fileURLToPath, pathToFileURL, domainToUnicode } from "url";
 import { mkdir } from "fs/promises";
 import pLimit from "p-limit";
 import { load } from "cheerio";
@@ -117,11 +117,12 @@ class WaybackMachineDownloader {
     try {
       if (this.base_url.includes("//")) {
         const u = new URL(this.base_url);
-        return u.host;
+        return domainToUnicode(u.host); // use human-readable domain
       }
     } catch {}
     return this.base_url;
   }
+
   backup_path() {
     if (this.directory) {
       return this.directory.endsWith(path.sep)
@@ -172,7 +173,7 @@ class WaybackMachineDownloader {
       const url = pair[1];
       try {
         const u = new URL(url);
-        const file_id = u.pathname;
+        const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
         const prev = curated.get(file_id);
         if (!prev || prev.timestamp <= ts) {
           curated.set(file_id, { file_url: url, timestamp: ts, file_id });
@@ -190,6 +191,7 @@ class WaybackMachineDownloader {
"%" + s.charCodeAt(0).toString(16) "%" + s.charCodeAt(0).toString(16)
); );
} }
async _structure_dir_path(dir_path) { async _structure_dir_path(dir_path) {
try { try {
await mkdir(dir_path, { recursive: true }); await mkdir(dir_path, { recursive: true });
@@ -265,7 +267,7 @@ class WaybackMachineDownloader {
     let html = fs.readFileSync(htmlPath, "utf8");
     const $ = load(html);
     const site = new URL(this.base_url);
-    const siteHost = site.hostname.replace(/^www\./, "");
+    const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
     const baseDir = path.dirname(htmlPath);
 
     const downloadTasks = [];
@@ -281,16 +283,17 @@ class WaybackMachineDownloader {
         try {
           const abs = new URL(val, pageUrl).toString();
           const u = new URL(abs);
-          const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
+          const isInternal =
+            domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
           if (isInternal || this.download_external_assets) {
-            const file_id = u.pathname;
+            const file_id = decodeURIComponent(u.pathname);
             const paths = this._determine_paths(abs, file_id);
             if (!paths) return;
             const { dir_path, file_path } = paths;
             if (this.rewrite_links) {
-              const normPath = u.pathname + (u.hash || "");
+              const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
               const localTarget = ensureLocalTargetForPath(normPath);
               const localAbsPath = path.join(backupRoot, localTarget);
               $(el).attr(attr, relativeLink(baseDir, localAbsPath));
@@ -315,10 +318,11 @@ class WaybackMachineDownloader {
         try {
           const abs = new URL(val, pageUrl).toString();
           const u = new URL(abs);
-          const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
+          const isInternal =
+            domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
           if (isInternal) {
-            const normPath = u.pathname + (u.hash || "");
+            const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
             const localTarget = ensureLocalTargetForPath(normPath);
             const localAbsPath = path.join(backupRoot, localTarget);
             $(el).attr(attr, relativeLink(baseDir, localAbsPath));
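
Note: domainToUnicode(u.hostname.replace(/^www\./, "")) now appears in three
places in this file; a follow-up could factor it into a small helper. A sketch
under that assumption (the name canonicalHost is hypothetical, not part of
this commit):

import { domainToUnicode } from "url";

// Strip a leading "www." and convert Punycode to Unicode so that
// "www.xn--e1afmkfd.xn--p1ai" and "пример.рф" compare equal.
function canonicalHost(hostname) {
  return domainToUnicode(hostname.replace(/^www\./, ""));
}

// e.g. isInternal = canonicalHost(u.hostname) === canonicalHost(site.hostname);

Worth noting: decodeURIComponent throws a URIError on malformed percent
sequences (e.g. a lone "%E0"); in this diff each call already sits inside a
try block, so such URLs are skipped rather than crashing the run.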

package.json

@@ -1,6 +1,6 @@
 {
   "name": "wayback-downloader",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "Interactive Wayback Machine downloader for archiving websites locally.",
   "type": "module",
   "main": "downloader.js",