From 7541f2ebeeeba30aaa566eb6f645ac9f311b4ae6 Mon Sep 17 00:00:00 2001 From: User Date: Sat, 27 Sep 2025 16:41:13 +0200 Subject: [PATCH] init: initial project setup --- README.md | 114 ++++- assets/webarchive-downloader.jpg | Bin 0 -> 23164 bytes wayback-machine-downloader/downloader.js | 508 +++++++++++++++++++++++ wayback-machine-downloader/package.json | 35 ++ 4 files changed, 656 insertions(+), 1 deletion(-) create mode 100644 assets/webarchive-downloader.jpg create mode 100644 wayback-machine-downloader/downloader.js create mode 100644 wayback-machine-downloader/package.json diff --git a/README.md b/README.md index 8f3501a..c6e24df 100644 --- a/README.md +++ b/README.md @@ -1 +1,113 @@ -# wayback-machine-downloader \ No newline at end of file +# Wayback Machine Downloader JS + +![Web Archive Website Downloader](assets/webarchive-downloader.jpg) + +A script written in **Node.js** for downloading websites from [Web Archive](https://web.archive.org/). + +Intended for use by: +- **Webmasters** — to restore their lost or hacked projects +- **OSINT researchers** — for local work with resources that no longer exist + +This webarchive website downloader has an interactive interface, supports downloading with either original links preserved or rewritten into relative ones (for local usage). + +--- + +## Features of Web Archive Website Downloader + +1. Download entire websites or individual pages from the archive, including HTML, images, scripts, styles, and other assets. +2. Rewrite internal links for correct local browsing. +3. Multithreading support. +4. Save results into a chosen folder while keeping the original structure. +5. Ability to download external assets (e.g., images or scripts from a CDN). + +#### Special Features + +- The script fixes parameterized file names such as `main.css?ver=1.2` into `main.css` for proper local work. 
+ +--- + +## Requirements + +- Node.js version 18.x or higher + +--- + +## Installation + +```bash +git clone https://github.com/birbwatcher/wayback-machine-downloader.git +cd wayback-machine-downloader/wayback-machine-downloader + +# Install dependencies +npm install +``` + +--- + +## Run + +```bash +node downloader.js +``` + +After launching, an interactive menu will appear with the following questions: + +- base URL (e.g., https://example.com) +- date range (from/to) +- number of threads +- link rewriting mode (keep as-is or convert to relative) +- whether to remove `rel=canonical` from the downloaded site +- whether to download external assets +- directory for saving the files + +--- + +## Example + +```bash +node downloader.js +``` + +Dialog example: + +```bash +Enter base URL to archive (e.g., https://example.com): https://example.com +From timestamp (YYYYMMDDhhmmss) or leave blank: 20200101000000 +To timestamp (YYYYMMDDhhmmss) or leave blank: 20201231235959 +Rewrite links? (yes=relative / no=as-is, default no): yes +Canonical: "keep" (default) or "remove": keep +How many download threads? (default 3): 5 +Only exact URL (no wildcard /*)? (yes/no, default no): no +Target directory (leave blank for default websites/<host>/): +Download external assets? (yes/no, default no): no +``` + +After this, the archive download will begin. + +--- + +## Common Issues + +#### Script downloads only the homepage +**Answer:** try specifying the base URL with `/*` at the end. +For example: `https://example.com/*`, or try downloading a different time range. + +--- + +## (Important) Download responsibly + +Please note that downloading third-party websites may violate copyright laws. +Use this tool responsibly and make sure not to break the law. + +--- + +## Contributing + +Pull requests are welcome! +For major changes, please open an issue first to discuss what you would like to change. + +1. Fork the project +2. Create your feature branch (`git checkout -b feature/fooBar`) +3. 
Commit your changes (`git commit -am 'Add some fooBar'`) +4. Push to the branch (`git push origin feature/fooBar`) +5. Create a new Pull Request diff --git a/assets/webarchive-downloader.jpg b/assets/webarchive-downloader.jpg new file mode 100644 index 0000000000000000000000000000000000000000..75f3e9fe97820aa6781adc1d7f1b58a61d475ff7 GIT binary patch literal 23164 zcmbTd1yoy6w=fu-LV-eYcMtAbio3hC6aoYb5VS~f4-|^KyA*dyad+1O1&V8N%A}vY z_s_g}v(}uftlWEK`#yKyeRiG~pEm)-vYuco06`NqRb&TiAhIsXu_M!H(jz zr)}S8slk@wwEFz2T&hmeARDm4M;DOxM>QRbk9HQqmb8)*)MB0@o(@h9AXjs0PX~KP zsEDUH?LUBvz_0&ybJ9}(V~VStIIYxQtkedo8r0Gd7Z9~D2RG1yi;IU^P?*Dtm!A)0 zWg*DLPR+x`!^_Fd&B-MUBTfq+;jjc-itq^VeGn4h;{jT63vvVbthj}N z=DgfiKz@GS4^~1#T*4MY-2aC2KdpZ&%_q$zC?_S%E5pOhEypD+AS5RwBqhTqC&0@u zEx`ZpwTg~VS93=T(7$nm;kf^`R``EiDau#0Lw18QzAUC0D=LpFflP-V!nF$@)Z#d77h^^{;OB`WK=|d8xuR!gDX60lsq}A|W9B z*Pa4EK}A49MnwC6^aGF)kx&p&UjY!15D}5lF;P(vQ4x{g{J{a>BIBXZaKA>C(sT}p z%caFPulYX3qZJrGtxfpWC67*rVDm$5w=}QJ+4+{m7y5h#Xb>MR01*KZ2^kF)1?4XV za5#tnTqGJiWNytkEh%&70NU3mxjY}@wejJw-V$t{y9CmmN$2s>Pd_gMUcwh3;v(S! zBmsB#w74Fe8O1(5h%4a{L54!xGr(d@LVgR-0Sn zyYQmu;bLdra>s3qGG447YRFntHM^4R-eZ^5Obnx8x8=uz<1Kqed+I4_cSL^ak3~*- zhr)XA`MbQrC3MS_MVmt`LynPx>t_lTOr&cPu1R_v`e)6T853l*J_nm^MJi^|MHEhm z_-i52Hq$Bliy1wZ;W)ouf6owMwKJ-ztR+XA@DvF#as5Q_^Jg-WcEB?rMwd!rLFmha z@-skXaMpB>v5nMUYyS=2 z3)Rx@4^PSS;r5KY?dVSfBGuzfqYTdgaLK&`T$toVd*7b{gyR6Z&A=dN7c;PA57(^TQ0^eO)2{K<_<9@ zpe?^ex_PU)wrO2&2L6<1)1+yB^d=uL_skDNlqdV?PKc4|YFbhV@-h}MBA+WVvM?Lk zA^fukJ#dQ^@es|{uJBs;yPEyM`x&6LH)s|77M)L|8$k=Osg>yV7PvOPH+x9wdg38b zY;ak!XJ+1Se0}pLapLnBbB7|?wmVTIh23yMMc@nMS8|i)guf~a5}Q-svc2u_AdBJn znrAA99AUd(&GoWzEnx9Y#hz@VGC0qXsS&naVw5a}W?oS_H90j~y4w9YYKC~9oOnu~ zLdxMp|1YwfuVf0ZOKrC!^F(2Zx{Nlm-&#!WGcCsJTMDayPPwG$Z{EAVdVgKT$Pt~m zi>yKmjQh@>ld_`0zmHD;QtfqQ#Q%= zwc53_{aAC7(6Lvy`0m9r^GHC{ies1;MyS5;cI~OK#p;y)_q)oUl*w8JKnJW1p029d zGs@#QinpL<55Y(T>RB5av)Z;?ZwvfqfMbf~!||lgL1gib+9k?plB0NPC?n3EV`to! 
z$-?r4HP;HHZ*7L5uqA0ngbMpl1gJarCVs%dii23xeYd!~r#jy#tGiwCwWUSAXSo@s ziO}~+J=OC5wLa$u^|~TSC$zW41?m}WX2+`z6)N;|lOkCo^|7p#X2z}a2gV+Du)cx^Gj-~{mwYQx9T@?j&abV z`VQOVYLVqL!2E9PmXHg0t7|Nr&Qp`L$BSh)80y z0w9pPU)tKs7}@Fsbrbju(9PFGJ7y;Ei6M*1|d7!D=zMCNV?}W>Fi&1LO|)i-uRqgEPwyouNSY z84~?*`?mHKE1Z$!(p7O*VH3`ZzPa+v2N~V4h#swyS4Il7!8SCK1#OwkzCU}~@~jSl zmMtmX)v=#czV7fGACqDy4=jo;w=)Y6THY(AwMax^wv>;dxZIgc#6vo0Kf(cq&Ga%~?>mvx zg-EkSfOPe)H`+`Ui2(eNHIcUvW!DCc%#@g+%<9@Q**^TSAZ?$uM7e?03?pSDW%&>l z;smUW6|$0_A_4g=*nt(d`1~^EL6b5TGgUauC0*qf8^SN8-uJeiafq(`s#u*DE*y8N zM1^4_u=htr#B}V)jt(%t`e5cR3-$X=H6(F%$#lx?lRr(cZhG>rTleN#qWJ5Wa0W8! z$Why1bM6;wws*a*7fw|V(c>>RedGjRN=ubYdg9w!P1>iLXy>JZS(}1SWNk4`4Nm3b5^=1 zo#Y$~)(a<)I*#nBpLd%d9wT=u+&FrM?J?NzE_bu-^GaO+4|uv&h!BdWK3QIzu|RQk zOc9KmNGfx`?Jg2X(4RHA_Xt5Y2M8xWs+=>=iG85g*hhK0Dr25Hm4%f5CBWGH0OO>e z?jY$6sQufw?zMc&6$_3RU#Vp8g>UiGf{+8z=lNMu-qlA`Q`vxbHcdL4LpOrkrcZT^ zrX=nSx_%u8HOGk+@!S4QPKwBy{S z4lWQj3)!LOs?=;A+4UD=l<3kVE9;5SRkq60Xm>UkokV>E9W|-e)&A&t3_jAl3pOxg zAkECV2%&B;>1zlX+ng7ETeJdtZ%Zu z;WTRcqNr%;uozd7{_CcOJaOJI{a)VZ$!{RGcL$kb8LXhUgDiAmxDxye5iDs3dctto48E`P1LY=a;JqC%#7pLb~Pq|9V}SP*wKy)!IUB#L^bCfQ-jkB^;2WC z!MObmm-$Ie8huWn<4DaA(uArWiKOX;nrd*l2?AP|o>LXpTJx!}402j4*9VW996wSP zR!)9rB~hBEX#L=6y|SQoJ*}I@6_DRh*OVyV)2@xUj>gyFUm(lT$zQAF>w0PBF*hp? 
zJoDqIDU>fBs*kc-S&h~vzoix#Z6}p@e=Cr?9dV7D*~(T7IZh)~nUIqU6O8LFUnd}G zy&>}{gfUhH^z#miN|%&=WLGZVS#{m{K=GF&UdrmNMIerwTV?etxpNossP?Rv1NjfriywQG&yIMo@{pp3iBZCgG0feGtC{e^vXC{|&H)gh?#hio*()cPDEAP4FqZ#< zM8twEmdO_|2;(&5Lv-sNLy{^4ll;R29v6RaRxeW`FBGpgR_Ve>ei6AA>s1V+FE(dh zh%z*v+(mOR!n~H+;+Dm1_d|TH@UY|>t}tJ;%_Rtu^KcRQ9;+SdeT>&A{64cOnXn5x z`x^p?nHpn8fz2g3wgfltEurMQ;?;29kOFVfn6d9V%vI=PJp<@$<#A`_mGar-4kqdwlU6gi zg@jxrZ$mznFgT#?OCRxcP>fdT=Z4iQ`d}1h8+mx&2x0J;N4q>#H9)&_o3a+bADECD zkOCy#*1jyrB*mN<4~tOs3|QLfv>a0$ZYA1Sy7JWO-K#h`?egopo5#W8T8+-gg|QQ5 z^~nQGI}Hik1`HImg^oZPo?o>ZVn#uB+h)K~Omn9`QD$FdM9oUrj{l`2VOa2nY?)pJ(84$tpAo78#j z;lanG;s6P5_F}Ctbcm>}x5RdW_9=(YX?eaDS!qlqVq@ciEZc>t~Dzw z&=4Jfgq*9P6<|^f^Tm?Z{?Ih;Bm3k?e`vQX&r#5rw8P0{IMUBsicrzK%GRx`TgUOG z-B6aXNgK1pI9F#b$Q{%>R5`IkbG$d{qxfVyKkq1Qw*7j_WW+sVD|1Wfhemo2iU2Q` zk%5oJaD@R8D%7E2WG82aY8S`)otC^h6gVsHx=lIz^4i2rKa{5+JnDnRTUuH?X@+F1 zsqljY?F=x6H)5GIOQ@C7n!S^Uls=>Xby$<~KXwtnuOO$@o@K`3;&OQn;3s)Ev?x#~t>eB%3M!CP zxdjtjb?nzhoYhTne44a6LCI!mI_?qrTB*|K4QH7a_3P3(B@Hvl%C+s$nQB7nboD^Y zk+c=;vY7q2!j_$NxnEVj08I;vR_RrF<7LwVxU{K_)@pV5lc(D*V)IB5NkzQx#Y^p79cA!?{C;;63v&Za9EalOI8 zgz0)1MXUrvW{$0AF(QLeVmF1~nR9u)B!D24n0dn{%0%8K$f{>OY0KPOWyITVbpT?g zWpBu0bzDQXjJ?bO?HUy1wx)JjzzQ|%7jwg{oDH&IMbinDdR$oTUDw&{)mdEkum@n* zUuqib?KwT3+>eDwBHNiupY(ardr9}2v{W}4p z9i{^B9?KtYnLVT?#yMb-x0mF2gWTNB{!(hXsO8Ctu~+D;Ifn(dFTOmnyU>&;|H7zOU} zz~p7Nzv-YaCApb93NPk{z8DXWbT;@uf)D+*`HYb-GT2{UXOip+g=y+A zIIgCaL)=h&i5h5_6RDO*IbrNnQRKy~BwezYUSMfA~s_JRX?7YN`TG@NrKVed%ob;ZSN)V9sWoXVnyc%Fzst zF&MGz@)1w=B1nodYziJ<$&k$GPeOyCy_g9;kF1PpjmkUjae7(kKPqxx49wLXX-pSo zqI7_)^T1zAljXM7^`fOKIMb`kUZGPTh%Ahpd{|sn#u@QWJxK#{D*A9RZjnVB>ou1i z;|;WEPaCIy@MUb2i*0qvCe|Bfr94!v9XBV(3*)+M%&%ZOFrHoD$jNfUy7SW-a`v_LjrS1IAbS^aSK#iby2-wactfGNAq2@T`UXf$N78aj=5yJ zzv?B+8gDn#<&@ona41)0RLM-rm6fg|S8YI~*{gOM+{!b4!7ytA%wD+>Y*C!ndiPj` zobx^9NO$eb>NwtfuN#x`w@DLd%h2tVMQu3xRlmTVze8Oc`FQ}MqItTePuqsQe>E>0 z&t1zE2G77&k%o&%8N?GRljVQ@VDa59U=vMBQ?$@!|DxwLOto!9eoZ`go7FLj@qkR< zknKcZqgx1;Dz(lu264y)djRVidbgI(W01xPid~DAL8rloq0;KD=kF+V+;5HT>(KX* 
zWR7NTQuF&{5202)USi0<;<+}5d!7MW^Rt4PxkMr^FW%r_{}#6cPg;+osijZS~2lK#YVI(LF<$;gylOp)2c zn>CX71oBfqo?fHrjxs$17B@sDd}<|6Xm3{b^axvJaZ zaWOVKbZ-xHMC1y(bI1eHm$58kXv2zpdIPbr1NqB2evRm@c6nl&2_MXCVNeMLwe5Ov zYx#>c;~+g)^tX9vsR-vw&v#LOzCiGWFAVdcpc)k&I2S&}rb@q!?p+^pJqkZCBLL`ixU*+YMsrRaZ4GxU~9zB0}2Gn+* z-b5Y1y0fS}ZXln_l&RA}sr}fcQqhEBJ z;XKcP052b@V<F&p9KFrxQHP9;$cmJwL zATH3-K^w(jpra3RP6NTP1O}f0YZnt)A{4u3`J?s>-WD-dEhl_7GX+?4p;eY%nrBHK;ac(pnC#;-z$?nlWfio#s2oz)dwo7Cu_7hb z&Sr(u*Ea27kBQx!CC`2ArBj6R#pAnkGL)po7M{yonN^xUlM1vcLXuuK@-lt~@Z(ji z^q{g)6u*R^rCj@z)z;U?rVZZO)^Qrtw-2Pj7%yIM^BCYP(e!-&CBXm;(04I(Hw=D% z5MLp7-#S-}sIAAdCi!ZNYRaVYvoN+HNt#rBZL(Zaw8hfQmgW+^L1B^_YQ5mPT7`+yypA8`SE# zbV&0wUJ@Igr>P9xIZG*w2Oc;YJvE`Fb+nPX@IC{KuIjYu;o*V9wAmrrT#{~OZ_D%35%on4x!%{u~c`foZrm$iv{}-HAW`Dj;*g_l!*IK5q3Rtt@lzh%lle2CUxMGLqwypeYW$(E#yd4Kp%S~5v#2@?g5wFcfr>alf;@VNj;;7F-p(}UDcs44IWg)7GVO^m0$|b zqOv_hwD4qu{^>#B&X6ON*YUwbD8H8~d`7AepL+Fi@vK+a%#XlRcE#vn{{5Y?GHvyJ>d-f8N4AAB1{@fQm-zCH=W}>k1iTWo-PuQT58jS9;?0cO zv{z+}HABgY&Q+Oy6O49frIm~0w%iZk#wXtxLHm-YyYVYj|NjCZs} zj|r&2oW$Ph0(fG!9p;SV_si;9dX!-(O~n#c;aJy8o;oroo{-n)ND^@}c{eh7T@`*B zb*UWeydx37bqg;iPw1xrYyIaO zDIm=ly%1?2->bK4#)Q8`>_A^3?0{_t8JSf(57KmA`)sxD_D+G0!dQecr~EY6Kpe#A zwbdV1dJinIe^%+59*<|K_O$U07I2S+<-R9812$vILp9`jm<_MhjvBY|-=`beT1b>` znqM#%US)K5vppyR3un)lJjympzxy0`(l@1^DRB*vMVW}vIN0z@xPUCSRXZ$vb& zwh_kr{gTel3B^-U2H z*Y!vzK{T5?{7b|)S#j4lBjBOUI~OsmR$L` z^Poc(_XS|1UtJyGK|GuGpgX_Gd_|=+f2ZU4uy04}RuAPFfOQzvvTVC-o0lD|wEnyA z8BmSfSw7;s`R-UZ(Rn>dpRKx_PCeR7SJ?L6Td(WjgQQSQ`7@wC^M~T`+?UJZuz4F# zD%Y8Z#8`~7lkY_~XcYrExEE}p_(|V-d>)l>c1WiMAN^Mq;+E=#E)GM_x!Tr_y$Be5 zvA5WrlqL*gC}@1ndG{-IoLd66S>_*xKbDHiU6RA%I=`Sh?Vy%j4=eO{bVhYK{S?uC zVd^3*aA$%R;{Wx5F{spIR=+#8d1o8z^;D`yE+aN{jX??PwVbgg%+)8q6SoAMbXhZx1M1l`6en45!qr{3Bk>mXH4~kBf`z@jN>!- z^QL*0WbYuB^Wjli*2>r?RT8k^?b>sJGBP0p#{?V4tPJ%nK6RU!&9P0YfdbJ7M)sJr7#RO$l8b+s<_u=xUW^Y z5h`LpulzsT@)4jll7VKFG&`Qgp}bf_X`4H4evbp*eerh6cidr7=3`-&+ROr!b zDGFxcEWNDG67Tn6Wx=PMYwDXa%jm9EI_W&*xh7S72OpdDL-k<`J-OJrwh$kw8GM5> 
zi}dEF5w0`)vb2tjc83l+|A4uimHE6Kh6Y5y6@a+?X6hH52%FaHsr^jd{W`7+f+gR1 zHq-ui{R8{c-&6tF7{2P6fAY`%Qkr``FOm2yLG7+dcw?7G|G?+}J|VWT=Bn~BkkVn? zW6QF7_HN@q5=nx93JT?)J#EfhEc8nj@z)E?z|>+XEVr=@4|mtqdC~7-PmSCXQq5MpE8?y{4fb>@1W6p449MZhO6DB)Q10J_-jP^)G?lN&=ChHcS3#p?-E3qpQ^I=jqhYm)1jHoL!Zj(zWtRHTW|VEro!}$ixE!QeEn5uN&TA{ z12Q0dp4dvpaX)ArnqjhKvSq6{t;jqe9Amd^v-?@Jh6RZ zj=yw-3X&|PSS5RXNjc10ApVX8v~lcIlkE?$)YV1MK|y+tM;~vis!lXDF;;dj*&WtBN-W6q1G-(bc~OvoJxqErI`E<5Uh7e>EyL zk}=lrZnnFyaAe0(fMbVcykzH^-HNyy2nOz2PLcD-NvF9@#^`Pm1mNnkmS7H}5)UTP z&&I3WvHa$MUJf)*gh&*Q`+44woR!pT{L|m)+ehH)w{OlePR$k@z>np3&LI#RF{1qn z03ZSAncQNWea&CzH9FamAIT5~6(wx`G?mn`f;(vE4^8yH8cedYQCwE|LPU;%`d-vB zWY;Ujtm}QVwh^1PZsHl;-X7+04S-3C4;gH2 z2P)fpvCi2TQL_CF4r7J{G1ml{*u%`kF+KEu^2h!-{Ux`jU=H_%16~V(R+~$tRBHM< zLuq^WQT-U?Xq(%3Bp`T;7AB$=TaSacEjzDkE{71h1sC_<9E_Io0r|$>KdcuUa`_B^ zJ0fW$rEGRFWCzd8VcdON!zhU|eJ)D65{tk=d{xrG=Uqoz( z?1?`1yQ`j3E+2Pm&5VNILGv#Z_|oUn`pa-)fx* zI>k4JuN?+(Idl3}+*`q$IvcYL87)2Vf;F>JUl{+$T`lNrx}|kTieO~_DF0`+@4D-~ zH}jqRhDWC6BP?@42xtD?qr=nnZT7Ubrit3|4wE*eFr`oV_cfod2Rt$IIBc4aA+fQ+ zn^%X=05lR!S@0LvV{}pBpLWgMfX!LNs`aH2kjtT$FXKM4t&>Q&{_^Z{tAccRf3TLO zhpPJ|$^ue|9PeA&T99bN%db3JzsDKhkT0fzX3cr@M?Jv7wtdm8L^*w^DZ401KB)QmxR0eF*9Gs zP#f`Yip(D0!@}No@>fyPgw@Jk2_`Cg8l^{9uJpkD}R-b^>nd2ZK})eG9~LiTuZw@k^|~{YRAK znLNL^(u{F! 
z4#(-eW*hP1T4&shDOr#g(PVmC3-24mxuOivK))W`U*Hw1i(> z;5*kd*elpIB|!za!_{?Vb)KJLTHLw#pS{&mzNX>gXGtgZrJ9M5)Yva}Ul`fgX`sQ3 zo#$S)*=uT3ml=eRsYJ^{@0HoMlh>&*d>InYM^14!d|w>LQ+jI(h_RN%wv{~R8e z_Od4LJiFEZ6hHImLv!6ELCKRAxpS3eIKbI1K`@eXvir`?LcHf?$~G8S9>VthT3>&b zofn0qtUn0rOvKI~4;(5AGR1uzA1>=sKgct%{N|1tJ8tT0`<8q1s}~Q-9y539`vZSB zLt0;@L3)r6*P^7s;IuG7*{}%PxBm>jKe^WWv@#PnaU1uT_>Vmkr>><#yf!}a=kfCJ zc#&{>tb|b&xJhj>s3!GDjM(4lpu3Xk1UW^LpHlEzEdSRCOAC z_J`Z6ddAsMK#+zrs)iJDP{6D3sJhSpOy>RTB_YakQUY&z->wTv&i_iUp-YMYPo<(l zHw67xu#9P3x`SbTOQ@vbF$IH}bD5*os$sC0D#IEe5sb%v?8lMsvJky;73J38eA8)$ zbU$8Ur(Io1y0rkWJK}p~hJaRRhsWx>eZDpdPio3jHppeGa3mq`3^UvxOU@lF!CK8$Y?Mh!DJIXZbc|8@9O@d$ou#FwvGaN$o=hZGzd zt9yfzQ1v#inRHg|$hM`!p3KhB2bXu;qb-?0?X8aK%YHpT6&8Xd)RY_zX}~DAV;nd@VH} zsrCbeh3i-!264le4VPji@vruY4iTy&xqr88J&)TnRL)qp{Cr*he28}NINjtV*Nu!G^eK;c) z#wU>s;u_C@F(PmMup8J%7u~fMy^K%w2faB>owlHz*$fiK&+8H_92IM#}Q2*jmg4m&`Th8Y?^{H(8Qmws1<; z%1mWYjngBtb2kXQ!2gr#)wOb_o^8Vi!`}8a#$7}Cb!CqIGzB9T@@GKWgxxgdMWjCP zpnhc$Gc>YuarSAz#v@K!KX=OXlAT3VzstKwpll)8Elk=o%;%o)&03ba?Yh8+n+T6Y z1zo5SuZ7)9kw$wv@kzwBi1((#RiaRCF_k%GS6Q-z7seZg-h^?^$?ccsNd0Az-rl*DxtNI{LJqfN!f)M zhS1KvpR`6t9L%?$gk)s%@q=X$i-HVgn5JqK8VyzR`TdMde5)1`J%Ad#AY|uznB7Md z!r^K*r(JhPE2ij0oxFf%E!&p%hI$*f(Ta$w$>|FeFCxLy7WsqieO=X~jFI$Kk-4um z6-a>v7Lv2MdYQ2*^7S#}U9G!0hY_o@iS8Qt-QehZP{UK=60k8dz9p}KFYwJzQ;euq z^8q1q(zJnO7p=&v___VQs22Pr5@`$0+TATH+LURcFpLUg_hU&_C$!L_*wmSq>bBBOMl4` z<81K%nRC+pnAyE>ab>L8yUkFjez%a){UcdonAX|iBsJWIT6bd_W3JsJrfwJn#U^1W zP?7Y0w^?>~&u2u?8M2lt>k?g;d`f1?3`oN53SeiF8GC{V0yS` zNXET;4cq!G3QxHaP3eAI<1uj4VTW=+@Drn8&Wh9+q~@4AGlDOt1tWT034Rk?>}N*m zx50X^5*7*^+WvgYDA#`w-F>VDHd@#bAr-zCO9|1RE-4|aA)FBjib%=wN(0YVA5fOf zwMGpmZdumncd1=Ra#&lDc$*j;rleh`2^Yx)qW=)lSGt3(Tqi?^4hL^sn-k@tl=xA; ze`o2Rs~30vx+^w_`KAkV_(R`zJAHdqNJLji**?+aIvmgd7hLFuicxiVlC|Kb#OpK zx0_J36&K2GRr|UT%un?s->OHD@3L9_5uWv;Lo<|+)z^nj1z9H?vlM~J4{lLNp32o{ z-*KddiL(CrlwhpJ=H!NGIY!9bsQCusJNnt6doau3j$wXUDqm`A2NQ3s&V6eIH5JB2 z;XYj0q}a+pG}iX}^nl@JZO`!Z<{40$3`f&`D1TdgAJwgSW07#4EuR;;Zhds&)x2)> 
zef)=yl=dg*QAu6FKk{A+sg5VRjxUPh<>s5sZiMtdCyJV-4LOsyey$)*SfgFe(ea1Z z6c>VEoZbff&Hlnl-6uouS9Tj~rt@vcM0};wM5H>73`t?HxyFUU~yS3-qPT^%31$e z3m{t8_&}UTM$`t@SuhG%=^Z2&CTcFBW2WdCn{Q z#hq%W5m37GgKZ6DeJaeOk6IcdiDY>Z+}ac9_0?%tK3TkU60kw__cpT2Ibb6UYd0LR zb!`d4_md?3RpZoB%k|Az%KYa81N-R6TX?QMU9)xeh}Ur}iYSOR52vke*NE?iqX;|a zq)PnDSVLmQnScsofyTZ_e_%_cL!O{|V>1D5v#P?}9;Tlzjm6JKzd)=vXE1Do(SXoh zwxr98rg0+Hr6pdfS7m885p<5wZ;bLkDWq>CE++|b7f6J?C|&4k zzIgEV8re=QCUb?++sx{;nUGKJ?jyqs4K%es9qhhURq-nQ=^o}*vf}anz+gU+_$1m6 z{+6u%;eNSkXYb;KLo=cL$kt75Vuk%l`jkqDIKcc*+Ao`>?l$$BOtT0a&ZSGk4&Xd*AQ%Oo*mE2yv|! z|E++A-^?6ac+=W)>#_;*T(PoxfQ>pLI_3XWBdn;c_--zVvuv+gG;hZmBWhX)rOose=m`z8= zg&r55h1EjL^!5n@V(wpL9#-%!NljJ^e8U+g$m-Tu5i;^57E}6)z!+qxr?3(zP&Wr~ zMYjE=E45)bA>EX=GcQV@<19Jt{vwPtcw%QjOy#$s*_TKB$iUHeF~^Up`SOD2x?qP` zp7xynC##6UoE4X?>X)Bbniw6~yLr2Yo3Ge};`Di}$XyhBk>n+^X_NX-_{R*!ppS1} zT}4s-c{2e2>t&tayM!a3&1mR&QJ^}PPPcM|^So=KWQvJ!lFKKqo>Lp$-;Ocsx)$vL z-E@a?$qp+!Z#TIAe6H|rUL~wmJnYOEqw3Qq_yTka9cHwp{~$4+2(*3%=$TpJRV&oL z(@>{g7brOe?IU2A%y(Ry;`qt+@lsDQ19SVqNfG_@BEP& zQ;R|QvEV?h^fqj<9vio}(aPXudXcT?+xVi(NK<}=a{#f?TV$-*xM|gV=ycEEah*!Ji#{efBi* zP-7%FqLx1eP$M z(;4pDlIv?VM+imw@5c8Fp3I&;Gfdh}l9%xFR<3G6O%AAvpYc&5`NHTHiO{0QtI;*GN2^zbq<$osh$*crGr>#8Ybb2^j|6tWe*5kN;DtdkTW8Y#(SiV zFz)mjm#6Ok;^O({v+$tXTGr9Zb#!qv>^4SIyk=7zQ|66#3D1v~LR+J{?e-m}huyaO z?K)GWK+YT$xE(zSh@Cdo#v{(P!;DBKg*7oy?O6f>$$dp-wbcnVs&%Gn$t&H}hHVbf zR6;VA9m$_xaCqO`CPtPqdC)9^2!9-U@=)2#y0M5m@OZEXS*Yo#Fs10#jZdicQAKMIaPfY3O@~6)Ll=6a(4{O2F z&IpBS;!MOL5Bf%jucBpbxYoE^zy4qCTxT>~f3zith?3|rF-j0Zh&Fmp5Iu-a45E%2 zJ%}28M6b~qb%-`&v}i#Lqh>Ip*AOLYq9t$s|MlKm>%EWf^ZR($I(Mym&)WBQ&fRD4 zvnQ6F-$VcEcp_$CBe=TWeNo3dOG6H+Ts#IbAK_(+|F)>AXq3OhY`k$%MO9s_d|&d9 z7hJFeaitQK>g3RldS=|D`L5sHUx_F9O9)9o#q9bo^7qSX@{Pnh){43png$C4n_#KX z00eUBpSOK%kwYVJ4wEU3|9HSPQm2VRC4(Dt--x3h(hl`(y=gGw+M&)PIMu0{!xEiS zGqdVhHUlL*HFzrrWs8i0Q6YB}T4(K<_287*rqb54NA(4~Av{i5_jAHlu2(OX0@YSt z33sKXHzNg_$xWjbw<))%?8<^n7lZ<-NqT*rRP^^SXvezPJyjPj*{YF@#vMEkc&gU& 
z_-)i#RnUfz+_2Q(?^VVPFCHuAF|qT@0WY7QE`TeN*?V$vp<0?tfuM-=o?(BbH242)~GR1?4AWL?#zaEAC4W_HNCS1k2Hd z+XAvf7h#ehp!N z#*Pgo$e6M4=h=Ur8l5WsuHP28gjjY>@fSW*JdOM($c0`Kt!CyUh$wNiq4A>b)0|Oj z>`LtN$XoF57+&>snURZT-|(>SRF>bqfU5q0<~Kd4TNaV!0VFuZi^kLWV?f|}yc^7h zd)+Boo;`3gvbwsf?y?xtfCfr4`1XBe_w7uXc{x^T`4tv&@Cwt=jE2ldyQnDR>xd`Y?98ptCjAct@ z(GezYRSI}U8;r+x%7hV-fEjY4re}g5JlN&c zwHCtc#e^@Uja`}8SI#b_b9CI)G#A2u~hx193*z*p|` zyiSRsdJw}X^`l#JH%qJ~SPx87ldDv@##)U~|4Woc#ybL*VoAz7_&1$U9jLOy18T8mg}cMclg5+Wm# zB8s(rS*uoz7bHCzOyWSWJHSsCPSh02rvEMp|ct@tr{{W<2lbM4a-l^?MI+%E?sQ%dRqnro*UyP zK@lTG&|bb^lp99;V(uK1nb(lp@fJ>VbowLdr>@XdPC-wVDj@>zr^#^7);Ig`$AITY zlKEbqKpY83W_J7?o#W#KM;8k0k<~gS+y+kV1o0Jr@rEqDM#4d`W3-*=ea$1Mcm0%0 zQ(WG0KI(T@YCgpBwS3Ew#&m@E64)|6l;N3i=1gOUWBTcR*k;}KhHK=tXF+!Fua-!Q zSdx5G9EpSthU|)2vN(%t3p@4V4SuI z@kc5Tvq>1EU!X*jf}Uxrv=lAO93@wJn16~!`D}Z@eq1tNOhSm#lxu5v+O>rnInR_> zulSjTVjyoOL+)Lw=b4y)wE6~mZZ(@@QaoE?*`?m~<_Q=tA7_aw53{tQsJNGNuau=* zw^4IG4(uG;SK?~fc;8Ft@DANRJq>6m)+;9g2s81}&D?{x%tk~&Fm*r8cEk5HJ&&n* zOiUac6|n2g0EEI2!Dj1xE#HyZy?My8ajp3sRs9wK|9oD5v3E9_dB{?c#CbMPE@JYq zL;NO`=r&P}w487Ll;O3A^6P57M4K2@zoeZ2u*~&4$JkxKzwu$%OMe zbegbxBt2SMX?oum?d4|^p&?~XEgc{;En(>6w%^U$@Wg(;gS^}z*A=_aX0mX#(N?pH zDfv>`xIr)sMO@l90WhEeURKC>wpTTE6RSkh_6$^gYex1lYLC3-5fy8U#tvmbI-mS< zD^81*<2zVLuI?{UYLgBGv{BdX?HTuDECc4ZZIc|@9BW`kDr@;QNg+|GGKC&Fy2~yA&pTk7!#YM3_Lr`gcS&-pi!Ocb@ne5TAwlcr z1D_tQuD@Cq6*hgQQ}8u=$0e!%lpAx0Mak^hN7Cf} z693#E_LJg+ z&fm-`ATtpnHIq-^YGqCT;c5|#D$w{xy-EimUo1RaX&PU9uJ0m;Ym)o7%U|o9lzBRB zc$Msr)ddYWKDr5yPeIPj{I2fY-{``N+70dYQpIE$z8%Ij%H?zLo)y14PV8_0OXOyz zKY1=2P4|P|g26SeD**slRO0JgHTM$qXBI>vQ~h0_<^5jcE~bGRvR}oVDsQwC(bv=2 z8i{ls|HMl!D%gMb2*$P7Xk2csXcJ0)4I z?olQvQ|SOYD!8{7KKYBeH2}b;SoLxmakSu{I5Bz?9$q4X-j=Pc#0MlE58j&L!cFV} z1g`;sZ++_SIm-VY(J^%FnqErxw|O$h4aD;!lG1s?_cua4hTE%K#G}I|iinluh#l#A z-k8mO*teZ`=*15g2P(00v;Cg8X^fr+ihhkf5)PcN+6fK3{I>A>^^2k3%T7+-rmK`6 zoTtV01<~H*PHwdov;H}5-y`r-MLfz1TuJ9C)l@UNVaak_)KjJYdzS+qu9N&&S*MWj zSl{<2fa=BF4SKU2q5ZrFlSC>T?0xgpK%+>y(NoHjjRT76l8wr_SK0fiUoDJmR0dkV 
z^zTJb@cNh1TORMm*XPLZ2hIO-_KxHJt(y_B2!SUVaw(U>1QWft<(2i*3NqX?7tV9@ z(bucE<29oV#x-7~$_ymCZ8>SQo%+tPVxvrTUCWESdgq14fuR-kd0_EnTi&;dd~GAn zI@G4iJ!rx*a0v^7wGa&2kbn=tTcaL9N!Id1;z9SH|M=0~xY$@#Xt)(xc}4a6W!>+^ zGn)pRne)a_t>EhF!jo&k&%E;e#zHP)w^!)DE!cSTH?XKu-_g#vD3rZcUv}*M?^_Lc z6>x)-$fH}I|CKnYdkFb#X68X@NC6^|){P|-43jvXH~i&(8w@E94!`2ur{vY|Z=|FM z8y$E;)L9;ehe%8=WFjUknkM4HBm0!$2EPo@9S#=$w+CHCE*+l+Ukx7`rR^YHemK4B z$v(?IOInITUC+{Nr<+#Sn|g@Zk2*Aeghr(@;WfVK4v}~7lfG|%;wi0c{n$Yy#$Kfu zDh8mieO7H`#FP2bKTT#7+@AV1>8`4h`Q}?aNUImna=20BVTFdwF!CV3D5OIY3angc z3jAZ@3?%ifjyN&D5u;nq@Mv~=QF3Hd$;LB6V;dMvOG~I`|9gFntC(Cj$0g@%TV+eh z3en#8RAc)(L+!ptyYJ^RpEjdt6cX)dHy)HWgX$!9vd6Rs|j>UNtEu2#jrj9!vR9sxl?D1zXd=!E%na>6R!^AIe&1DS#AePg=mZ<^nGKTE?TMiY-(P zSrt2uFG5rnPf0&la=z?x^%Eoy^S61Z@5GFaxhjRr?$wV{$U%qad104%?8Y;ghg%{^ zC%Jd{s=Jd!s!EFltW6oktA-yfImDzTS8S7J&(o89ThRwiH zQ%KT`nAX^RlV{66!MgF3x4XnP{~8QD$eSU!+^`c zHn(Rg%6&>KOqm4cZ&ud=(cR6WEy#HxaKoFeV7vQYyiPFvFdo>RYvHWDSyvw!uAoCK zTal(Jw9`V=PHfl&JhB}CLYvG(c>VH+O;n}`SqAG~LXK~Yf!9-Cu6>iwsL6%+VQ~w{ z36KM3$HESd`QtjzGsB`o$~;HAj= zVP4+))d7F=m0;{ZfqR4Yb1%{IOoex9=a+{gV$Bi(xrBw1@fdY}A&9BB=y=Ae( z1AX)j60INAUs&l#I}$}|uEjjFHmYiJ>1&%;1Iuxm(r54HHqJsZ;NENThw5p4I`a1B z&*ICo7AVaQ+b2AG56FDZ8PMLcw){Lm8;csm?rK;DrSkKvvX*fBCQH>;%S@VXHh81y z^G~aT@$$uevGJ8XIr!l2PO8b7P15@0D|oLHbHk}y@apqh%Yahwnab=8s02zx!%=$E z+4)%IL`+LD_gkOCmVjrh+UnH`+g(`effy7cjWQE;sULZdZ-veUo+)sL%@Y5A zq&2sFcHJX}mGqdnDoSMQr!6c&Io+|;wBsxw zR6y~go6mPdimORpP;dV5`l7y;g^UM3;= z@GLx4U5jfYux3%~V=(u0l@+c9x3o93YmxYBs#$XQrAqVbMsu?{aRuWz`@PS))jL2+ zJ>xH~8TttUs**kz%ixi9MypN<_IzrS z?kOFH=l*tow%VGU_`ZzhP0zT`w4f73x@9bzK0O+Q&Y^5CCRO`{pML4$ zJysUj!iT?aVzuzab?HdzMnE2XFkA38#Gktfk*SrJV?G!jAsj4YY23i3_7VO`!>DA+ znHToD>i8gHozJ1Fs))!(E0V-?pdkY4L|o)a)M=|<*lAx_>0zDY_^Y<*l={cYQYo=r&^`e-bi<>mSONe2z1=JKeMT~cr-RGSwDhWw^|QP$2k5QC1rQ2u4LN7 zOnHBtnfyNkEu{y7AoJWw*ca#JyHud@J7sSf()0qeg-lqW{L0BenHgFRPnFGY@B~SA zxb53K9wB;0b1RJDo<$K`2qcc#*o@SKxj|hllC$m+WiyrFq`E?R*9|;Uc^_}xYrE!k zvGAG2YoT3eEAvyUff>P;c@;AOyiG8iI!IVc=L;yJ%!1E+>g7ExTX#3iYaxwp4@&r6 
zpjR01Ajg?&v|z@bc%~lu!+~r}_RH!Uxr@PHkp8aGytdKAwJuNm(!QzQMu;;7w{xVd zqE;2t(OAndpYR4buH&rhr4e9AX2THx%`!I>fc9ncEZeD;E%upR@TQyxOajiPr=C|T zfE*48>YB6wWg#5 zh76{5*M~ONv;L#z=*N06sCN~21mL~qOGe&3x!d>oQWdmAtOn#rqVIhmD3wSs|NReB O!T(?U9}@C+?mqybvtN<` literal 0 HcmV?d00001 diff --git a/wayback-machine-downloader/downloader.js b/wayback-machine-downloader/downloader.js new file mode 100644 index 0000000..d20a86b --- /dev/null +++ b/wayback-machine-downloader/downloader.js @@ -0,0 +1,508 @@ +/* + * Wayback Machine Downloader 0.1 by WhitelightSEO — Interactive (Node.js, ESM) + * Run: node downloader.js + */ + +import fs from "fs"; +import path from "path"; +import { fileURLToPath, pathToFileURL } from "url"; +import { mkdir } from "fs/promises"; +import pLimit from "p-limit"; +import { load } from "cheerio"; +import { Readable } from "stream"; +import readline from "readline"; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +// ----------------------------- PROGRESS BAR ----------------------------- +function renderProgress(current, total) { + const width = 40; + const ratio = total > 0 ? current / total : 0; + const filled = Math.round(ratio * width); + const bar = "█".repeat(filled) + "-".repeat(width - filled); + process.stdout.write( + `\r[${bar}] ${Math.round(ratio * 100)}% (${current}/${total})` + ); + if (current === total) process.stdout.write("\n"); +} + +// ----------------------------- HELPERS ----------------------------- +function toPosix(p) { + return p.split(path.sep).join("/"); +} +function relativeLink(fromDir, toFile) { + const rel = path.relative(fromDir, toFile); + return toPosix(rel || path.basename(toFile)); +} +function ensureLocalTargetForPath(pathname) { + return pathname.endsWith("/") || !path.posix.basename(pathname).includes(".") + ? 
path.posix.join(pathname, "index.html") + : pathname; +} + +// ----------------------------- HTML CHECK ----------------------------- +function isHtmlFile(filePath, contentType, firstBytes) { + if (contentType && /text\/html/i.test(String(contentType))) return true; + const ext = path.extname(filePath).toLowerCase(); + if ([".html", ".htm", ".php", ".asp", ".aspx"].includes(ext)) return true; + const head = (firstBytes || "").toString("utf8", 0, 512); + return /]/i.test(head); +} + +// ----------------------------- Archive API ----------------------------- +async function getRawListFromApi({ + baseUrl, + pageIndex, + all, + fromTimestamp, + toTimestamp, +}) { + const cdx = new URL("https://web.archive.org/cdx/search/xd"); + const params = new URLSearchParams(); + params.set("output", "json"); + params.set("url", baseUrl); + params.set("fl", "timestamp,original"); + params.set("collapse", "digest"); + params.set("gzip", "false"); + if (!all) params.append("filter", "statuscode:200"); + if (fromTimestamp && Number(fromTimestamp) !== 0) + params.set("from", String(fromTimestamp)); + if (toTimestamp && Number(toTimestamp) !== 0) + params.set("to", String(toTimestamp)); + if (pageIndex != null) params.set("page", String(pageIndex)); + cdx.search = params.toString(); + + try { + const res = await fetch(cdx.toString(), { method: "GET", redirect: "follow" }); + const text = await res.text(); + const json = JSON.parse(text); + if ( + Array.isArray(json) && + Array.isArray(json[0]) && + json[0].join(",") === "timestamp,original" + ) { + json.shift(); + } + return json || []; + } catch (e) { + console.log(`ERROR getRawListFromApi: ${e}`); + return []; + } +} + +// ----------------------------- DOWNLOADER CLASS ----------------------------- +class WaybackMachineDownloader { + constructor(params) { + this.base_url = params.base_url; + this.exact_url = !!params.exact_url; + this.directory = params.directory || null; + this.from_timestamp = params.from_timestamp + ? 
Number(params.from_timestamp) + : 0; + this.to_timestamp = params.to_timestamp ? Number(params.to_timestamp) : 0; + this.threads_count = + params.threads_count != null ? Number(params.threads_count) : 3; + + this.download_external_assets = params.download_external_assets || false; + + this.rewrite_mode = params.rewrite_mode || "as-is"; + this.rewrite_links = this.rewrite_mode === "relative"; + this.canonical_action = params.canonical_action || "keep"; + + this._processed = 0; + } + + backup_name() { + try { + if (this.base_url.includes("//")) { + const u = new URL(this.base_url); + return u.host; + } + } catch {} + return this.base_url; + } + backup_path() { + if (this.directory) { + return this.directory.endsWith(path.sep) + ? this.directory + : this.directory + path.sep; + } + return path.join("websites", this.backup_name(), path.sep); + } + + async get_all_snapshots_to_consider() { + console.log("Getting snapshot pages"); + const httpOpts = { + all: true, + fromTimestamp: this.from_timestamp, + toTimestamp: this.to_timestamp, + }; + let list = []; + + list = list.concat( + await getRawListFromApi({ baseUrl: this.base_url, pageIndex: null, ...httpOpts }) + ); + process.stdout.write("."); + + if (!this.exact_url) { + const wildcard = this.base_url.endsWith("/*") + ? 
this.base_url + : this.base_url.replace(/\/*$/, "") + "/*"; + for (let i = 0; i < 100; i++) { + const batch = await getRawListFromApi({ + baseUrl: wildcard, + pageIndex: i, + ...httpOpts, + }); + if (!batch || batch.length === 0) break; + list = list.concat(batch); + process.stdout.write("."); + } + } + console.log(` found ${list.length} snapshots to consider.\n`); + return list; + } + + async get_file_list_by_timestamp() { + const curated = new Map(); + const all = await this.get_all_snapshots_to_consider(); + for (const pair of all) { + const ts = pair[0]; + const url = pair[1]; + try { + const u = new URL(url); + const file_id = u.pathname; + const prev = curated.get(file_id); + if (!prev || prev.timestamp <= ts) { + curated.set(file_id, { file_url: url, timestamp: ts, file_id }); + } + } catch {} + } + const arr = Array.from(curated, ([file_id, v]) => ({ ...v, file_id })); + arr.sort((a, b) => String(b.timestamp).localeCompare(String(a.timestamp))); + return arr; + } + + _windowsSanitize(p) { + if (process.platform !== "win32") return p; + return p.replace(/[:*?&=<>\\|]/g, (s) => + "%" + s.charCodeAt(0).toString(16) + ); + } + async _structure_dir_path(dir_path) { + try { + await mkdir(dir_path, { recursive: true }); + } catch (e) { + if (!e || e.code !== "EEXIST") throw e; + } + } + + _determine_paths(file_url, file_id) { + if (file_url.startsWith("data:") || file_url.startsWith("javascript:")) + return null; + if (file_id.length > 200) return null; + + const backup = this.backup_path(); + const parts = file_id.split("/").filter(Boolean); + let dir_path, file_path; + + if (file_id === "") { + dir_path = backup; + file_path = path.join(backup, "index.html"); + } else if ( + file_url.endsWith("/") || + !parts[parts.length - 1].includes(".") + ) { + dir_path = path.join(backup, ...parts); + file_path = path.join(dir_path, "index.html"); + } else { + dir_path = path.join(backup, ...parts.slice(0, -1)); + file_path = path.join(backup, ...parts); + } + + dir_path = 
this._windowsSanitize(dir_path); + file_path = this._windowsSanitize(file_path); + + return { dir_path, file_path }; + } + + async _download_asset(assetUrl, pageTimestamp, file_path, dir_path) { + try { + if (fs.existsSync(file_path)) return file_path; + + await this._structure_dir_path(dir_path); + const snapshotUrl = `https://web.archive.org/web/${pageTimestamp}id_/${assetUrl}`; + let res; + try { + res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" }); + } catch (e) { + console.log(`Skipping asset ${assetUrl}, fetch failed: ${e}`); + return null; + } + if (!res.ok || !res.body) { + console.log(`Skipping asset ${assetUrl}, bad response ${res.status}`); + return null; + } + + await new Promise((resolve, reject) => { + const ws = fs.createWriteStream(file_path); + Readable.fromWeb(res.body).pipe(ws); + ws.on("finish", resolve); + ws.on("error", reject); + }); + + return file_path; + } catch (e) { + console.log(`Asset download failed: ${assetUrl} → ${e}`); + return null; + } + } + + async _process_html_assets(htmlPath, pageUrl, pageTimestamp) { + try { + const backupRoot = this.backup_path(); + let html = fs.readFileSync(htmlPath, "utf8"); + const $ = load(html); + const site = new URL(this.base_url); + const siteHost = site.hostname.replace(/^www\./, ""); + const baseDir = path.dirname(htmlPath); + + const downloadTasks = []; + + // ----------- ASSETS ----------- + $( + "img[src], script[src], link[href], source[src], video[src], audio[src], iframe[src]" + ).each((_, el) => { + const attr = el.tagName === "link" ? 
"href" : "src"; + const val = $(el).attr(attr); + if (!val) return; + + try { + const abs = new URL(val, pageUrl).toString(); + const u = new URL(abs); + const isInternal = u.hostname.replace(/^www\./, "") === siteHost; + + if (isInternal || this.download_external_assets) { + const file_id = u.pathname; + const paths = this._determine_paths(abs, file_id); + if (!paths) return; + const { dir_path, file_path } = paths; + + if (this.rewrite_links) { + const normPath = u.pathname + (u.hash || ""); + const localTarget = ensureLocalTargetForPath(normPath); + const localAbsPath = path.join(backupRoot, localTarget); + $(el).attr(attr, relativeLink(baseDir, localAbsPath)); + } + + if (!fs.existsSync(file_path)) { + downloadTasks.push( + this._download_asset(abs, pageTimestamp, file_path, dir_path) + ); + } + } + } catch {} + }); + + // ----------- INTERNAL LINKS (pages/forms) ----------- + if (this.rewrite_links) { + $("a[href], form[action]").each((_, el) => { + const attr = el.tagName === "a" ? 
"href" : "action"; + const val = $(el).attr(attr); + if (!val) return; + + try { + const abs = new URL(val, pageUrl).toString(); + const u = new URL(abs); + const isInternal = u.hostname.replace(/^www\./, "") === siteHost; + + if (isInternal) { + const normPath = u.pathname + (u.hash || ""); + const localTarget = ensureLocalTargetForPath(normPath); + const localAbsPath = path.join(backupRoot, localTarget); + $(el).attr(attr, relativeLink(baseDir, localAbsPath)); + } + } catch {} + }); + } + + await Promise.all(downloadTasks); + + if (this.canonical_action === "remove") { + $("link[rel=\"canonical\"]").remove(); + } + + fs.writeFileSync(htmlPath, $.html(), "utf8"); + } catch (e) { + console.log(`HTML processing error: ${e}`); + } + } + + async _download_single(file_remote_info, total) { + const file_url = String(file_remote_info.file_url); + const file_id = file_remote_info.file_id; + const file_timestamp = file_remote_info.timestamp; + const paths = this._determine_paths(file_url, file_id); + if (!paths) { + console.log(`Skipping invalid URL: ${file_url}`); + this._processed++; + renderProgress(this._processed, total); + return; + } + const { dir_path, file_path } = paths; + + if (fs.existsSync(file_path)) { + this._processed++; + renderProgress(this._processed, total); + return; + } + + try { + await this._structure_dir_path(dir_path); + const snapshotUrl = `https://web.archive.org/web/${file_timestamp}id_/${file_url}`; + let res; + try { + res = await fetch(snapshotUrl, { method: "GET", redirect: "follow" }); + } catch (e) { + console.log(`Skipping ${file_url}, fetch failed: ${e}`); + return; + } + + if (!res.ok || !res.body) { + console.log(`Skipping ${file_url}, bad response ${res.status}`); + return; + } + + await new Promise((resolve, reject) => { + const ws = fs.createWriteStream(file_path); + Readable.fromWeb(res.body).pipe(ws); + ws.on("finish", resolve); + ws.on("error", reject); + }); + + const contentType = res.headers.get("content-type"); + const ext = 
path.extname(file_path).toLowerCase(); + const looksHtml = + isHtmlFile(file_path, contentType, null) || + ext === "" || + ext === ".html" || + ext === ".htm"; + if (looksHtml) { + await this._process_html_assets(file_path, file_url, file_timestamp); + } + } catch (e) { + console.log(`Download failed for ${file_url}: ${e}`); + } finally { + this._processed++; + renderProgress(this._processed, total); + } + } + + async download_files() { + const startTime = Date.now(); + console.log( + `Downloading ${this.base_url} to ${this.backup_path()} from Wayback Machine archives.` + ); + const list = await this.get_file_list_by_timestamp(); + if (list.length === 0) { + console.log("No files to download."); + return; + } + + const concurrency = + this.threads_count && this.threads_count > 0 ? this.threads_count : 1; + const limit = pLimit(concurrency); + this._processed = 0; + await Promise.all( + list.map((info) => limit(() => this._download_single(info, list.length))) + ); + const endTime = Date.now(); + console.log( + `\nDownload completed in ${((endTime - startTime) / 1000).toFixed( + 2 + )}s, saved in ${this.backup_path()} (${list.length} files)` + ); + } +} + +// ============================= INTERACTIVE RUN ============================= +function ask(rl, question) { + return new Promise((resolve) => rl.question(question, (answer) => resolve(answer.trim()))); +} + +async function interactiveMain() { + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }); + + let base_url; + while (true) { + base_url = await ask(rl, "Enter base URL to archive (e.g., https://example.com): "); + if (!base_url) continue; + try { + new URL(base_url); + break; + } catch { + console.log("Please enter a valid URL.\n"); + } + } + + const from_timestamp = await ask(rl, "From timestamp (YYYYMMDDhhmmss) or leave blank: "); + const to_timestamp = await ask(rl, "To timestamp (YYYYMMDDhhmmss) or leave blank: "); + + let rewrite_mode = "as-is"; + const m = await 
ask(rl, "Rewrite links? (yes=relative / no=as-is, default no): "); + if (/^y(es)?$/i.test(m)) rewrite_mode = "relative"; + + let canonical_action = "keep"; + if (rewrite_mode === "relative") { + const c = await ask(rl, 'Canonical: "keep" (default) or "remove": '); + if ((c || "").toLowerCase() === "remove") canonical_action = "remove"; + } + + let threads_count = await ask(rl, "How many download threads? (default 3): "); + threads_count = parseInt(threads_count || "3", 10); + if (!Number.isFinite(threads_count) || threads_count <= 0) threads_count = 3; + + const exact_url = /^y(es)?$/i.test( + await ask(rl, "Only exact URL (no wildcard /*)? (yes/no, default no): ") + ); + const directory = await ask( + rl, + "Target directory (leave blank for default websites//): " + ); + + const ext = await ask(rl, "Download external assets? (yes/no, default no): "); + const download_external_assets = /^y(es)?$/i.test(ext); + + rl.close(); + + const dl = new WaybackMachineDownloader({ + base_url, + exact_url, + directory: directory || null, + from_timestamp: from_timestamp || 0, + to_timestamp: to_timestamp || 0, + threads_count, + rewrite_mode, + canonical_action, + download_external_assets, + }); + + await dl.download_files(); +} + +const isDirectRun = + import.meta.url === `file://${process.argv[1]}` || + import.meta.url === pathToFileURL(process.argv[1]).href; + +if (isDirectRun) { + interactiveMain().catch((err) => { + console.error(`FATAL: ${err?.stack || err}`); + process.exit(1); + }); +} + +export { WaybackMachineDownloader }; diff --git a/wayback-machine-downloader/package.json b/wayback-machine-downloader/package.json new file mode 100644 index 0000000..144a2b6 --- /dev/null +++ b/wayback-machine-downloader/package.json @@ -0,0 +1,35 @@ +{ + "name": "wayback-downloader", + "version": "0.1.0", + "description": "Interactive Wayback Machine downloader for archiving websites locally.", + "type": "module", + "main": "downloader.js", + "bin": { + "wayback-downloader": 
"downloader.js" + }, + "scripts": { + "start": "node downloader.js" + }, + "dependencies": { + "cheerio": "^1.0.0-rc.12", + "p-limit": "^4.0.0" + }, + "engines": { + "node": ">=18" + }, + "keywords": [ + "wayback-machine-downloader", + "web-archive-downloader", + "archiver" + ], + "author": "birbwatcher", + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/birbwatcher/wayback-downloader.git" + }, + "bugs": { + "url": "https://github.com/birbwatcher/wayback-downloader/issues" + }, + "homepage": "https://github.com/birbwatcher/wayback-downloader#readme" +}