feat: add support for Cyrillic URLs and paths

User
2025-09-28 15:27:38 +02:00
parent 7541f2ebee
commit f840c4a8f1
4 changed files with 29 additions and 11 deletions
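
Background: the WHATWG URL parser behind new URL() stores hosts in Punycode
and keeps paths percent-encoded, so a Cyrillic site would otherwise be saved
under names like "xn--e1afmkfd.xn--p1ai" and "/%D0%BF...". A minimal sketch of
the two conversions this commit applies (Node.js, ESM; "пример.рф" is a
hypothetical domain used only for illustration):

import { domainToUnicode } from "url";

const u = new URL("https://пример.рф/путь");
console.log(u.hostname);                     // "xn--e1afmkfd.xn--p1ai" (Punycode)
console.log(domainToUnicode(u.hostname));    // "пример.рф" (human-readable)
console.log(u.pathname);                     // "/%D0%BF%D1%83%D1%82%D1%8C" (percent-encoded UTF-8)
console.log(decodeURIComponent(u.pathname)); // "/путь" (decoded form used as a file id)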

.DS_Store (new binary file; contents not shown)

README.md

@@ -12,6 +12,20 @@ This webarchive website downloader has an interactive interface, supports downlo
 ---
+## Table of Contents
+- [Features of Web Archive Website Downloader](#features-of-web-archive-website-downloader)
+- [Special Features](#special-features)
+- [Requirements](#requirements)
+- [Installation](#installation)
+- [Run](#run)
+- [Example](#example)
+- [Common Issues](#common-issues)
+- [(Important) Download responsibly](#important-download-responsibly)
+- [Contributing](#contributing)
+---
 ## Features of Web Archive Website Downloader
 1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets.

downloader.js

@@ -1,11 +1,11 @@
 /*
- * Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM)
+ * Wayback Machine Downloader 0.2 by WhitelightSEO — Interactive (Node.js, ESM)
  * Run: node downloader.js
  */
 
 import fs from "fs";
 import path from "path";
-import { fileURLToPath, pathToFileURL } from "url";
+import { fileURLToPath, pathToFileURL, domainToUnicode } from "url";
 import { mkdir } from "fs/promises";
 import pLimit from "p-limit";
 import { load } from "cheerio";
@@ -117,11 +117,12 @@ class WaybackMachineDownloader {
     try {
       if (this.base_url.includes("//")) {
         const u = new URL(this.base_url);
-        return u.host;
+        return domainToUnicode(u.host); // use human-readable domain
       }
     } catch {}
     return this.base_url;
   }
+
   backup_path() {
     if (this.directory) {
       return this.directory.endsWith(path.sep)
@@ -172,7 +173,7 @@ class WaybackMachineDownloader {
       const url = pair[1];
       try {
         const u = new URL(url);
-        const file_id = u.pathname;
+        const file_id = decodeURIComponent(u.pathname); // decode Cyrillic paths
         const prev = curated.get(file_id);
         if (!prev || prev.timestamp <= ts) {
           curated.set(file_id, { file_url: url, timestamp: ts, file_id });
@@ -190,6 +191,7 @@ class WaybackMachineDownloader {
"%" + s.charCodeAt(0).toString(16) "%" + s.charCodeAt(0).toString(16)
); );
} }
async _structure_dir_path(dir_path) { async _structure_dir_path(dir_path) {
try { try {
await mkdir(dir_path, { recursive: true }); await mkdir(dir_path, { recursive: true });
@@ -265,7 +267,7 @@ class WaybackMachineDownloader {
     let html = fs.readFileSync(htmlPath, "utf8");
     const $ = load(html);
     const site = new URL(this.base_url);
-    const siteHost = site.hostname.replace(/^www\./, "");
+    const siteHost = domainToUnicode(site.hostname.replace(/^www\./, ""));
     const baseDir = path.dirname(htmlPath);
 
     const downloadTasks = [];
@@ -281,16 +283,17 @@ class WaybackMachineDownloader {
         try {
           const abs = new URL(val, pageUrl).toString();
           const u = new URL(abs);
-          const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
+          const isInternal =
+            domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
           if (isInternal || this.download_external_assets) {
-            const file_id = u.pathname;
+            const file_id = decodeURIComponent(u.pathname);
             const paths = this._determine_paths(abs, file_id);
             if (!paths) return;
             const { dir_path, file_path } = paths;
             if (this.rewrite_links) {
-              const normPath = u.pathname + (u.hash || "");
+              const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
               const localTarget = ensureLocalTargetForPath(normPath);
               const localAbsPath = path.join(backupRoot, localTarget);
               $(el).attr(attr, relativeLink(baseDir, localAbsPath));
@@ -315,10 +318,11 @@ class WaybackMachineDownloader {
         try {
           const abs = new URL(val, pageUrl).toString();
           const u = new URL(abs);
-          const isInternal = u.hostname.replace(/^www\./, "") === siteHost;
+          const isInternal =
+            domainToUnicode(u.hostname.replace(/^www\./, "")) === siteHost;
           if (isInternal) {
-            const normPath = u.pathname + (u.hash || "");
+            const normPath = decodeURIComponent(u.pathname) + (u.hash || "");
             const localTarget = ensureLocalTargetForPath(normPath);
             const localAbsPath = path.join(backupRoot, localTarget);
             $(el).attr(attr, relativeLink(baseDir, localAbsPath));
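
Note: domainToUnicode(u.hostname.replace(/^www\./, "")) now appears in three
places in this file; a follow-up could factor it into a small helper. A sketch
under that assumption (the name canonicalHost is hypothetical, not part of
this commit):

import { domainToUnicode } from "url";

// Strip a leading "www." and convert Punycode to Unicode so that
// "www.xn--e1afmkfd.xn--p1ai" and "пример.рф" compare equal.
function canonicalHost(hostname) {
  return domainToUnicode(hostname.replace(/^www\./, ""));
}

// e.g. isInternal = canonicalHost(u.hostname) === canonicalHost(site.hostname);

Worth noting: decodeURIComponent throws a URIError on malformed percent
sequences (e.g. a lone "%E0"); in this diff each call already sits inside a
try block, so such URLs are skipped rather than crashing the run.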

package.json

@@ -1,6 +1,6 @@
 {
   "name": "wayback-downloader",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "Interactive Wayback Machine downloader for archiving websites locally.",
   "type": "module",
   "main": "downloader.js",