Apple Documentation Archiver

Apple documentation websites are fully dynamic. Simply saving a page won't show any content. So I wrote a script to archive Apple Documentation Archive offline. It's a node.js program. It will take few days around three or so to complete the download. It is recommended to run this in one go.
// Script to download Apple Documentation Archive for offline viewing.
// Run the program using:
//   $ node main.js
// To fetching a specific page and linked resources the link can be specified
// as command line arg. It takes a single url or multiple comma separated urls.
//   $ node main.js https://url, https://url2
// If url is not specified, it takes the one in the program.
//
// Thu 08 Jan 26 - Jaseem V V

import fetch from "node-fetch";
import { JSDOM } from "jsdom";
import fs from "fs-extra";
import path from "path";
import mime from "mime-types";
import { URL } from "url";

const DEFAULT_START_URLS = JSON.parse(
    fs.readFileSync(new URL("./urls.json", import.meta.url), "utf-8")
).urls;
const START_URLS = process.argv[2] ? process.argv[2].split(",") : DEFAULT_START_URLS;
const OUTPUT_DIR = "./offline";

try {
    new URL(START_URLS);
} catch {
    console.error("Invalid URL: ", START_URLS);
    process.exit(1);
}

const ALLOWED_ORIGIN = "https://developer.apple.com";
const ALLOWED_PATH_PREFIXES = [
    "/library/archive/"
];
const STATE_FILE = path.join(OUTPUT_DIR, ".crawl-state.json");
const FETCH_DELAY_MS = 300;  // change to 0 to disable
var visitedPages = new Set();
const downloadedAssets = new Map();

function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
}

if (await fs.pathExists(STATE_FILE)) {
    const saved = JSON.parse(await fs.readFile(STATE_FILE, "utf8"));
    visitedPages = new Set(saved.visitedPages);
}

// Remove URL fragment from links
function canonicalize(url) {
    const u = new URL(url);
    u.hash = "";
    return u.href;
}

// Replace chars not specified by the regex with _.
function sanitizePath(p) {
    return p.replace(/[^a-zA-Z0-9._/-]/g, "_");
}

// Get a clean local path from url.
function localPathForUrl(url) {
    const u = new URL(url);
    let p = u.pathname;
    if (p.endsWith("/")) p += "index.html";  // Directory, append .index.html
    if (!path.extname(p)) p += ".html";  // No file extension, add .html
    return sanitizePath(path.join(u.hostname, p));
}

function isHtmlContentType(type) {
    if (!type) return false;
    return type.includes("text/html");
}

// Download assets. Can be js, css, another html page.
async function downloadAsset(url) {
    if (downloadedAssets.has(url)) return downloadedAssets.get(url);

    if (FETCH_DELAY_MS > 0) {
        await sleep(FETCH_DELAY_MS);
    }
    const res = await fetch(url);
    if (!res.ok) return null;
    
    const arrayBuffer = await res.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    const type = res.headers.get("content-type");
    const ext = mime.extension(type) || "bin";
    const u = new URL(url);
    const filename = sanitizePath(
        path.join(u.hostname, u.pathname)) +
        (path.extname(u.pathname) ? "" : "." + ext);
    const fullPath = path.join(OUTPUT_DIR, filename);
    
    if (!(await fs.pathExists(fullPath))) {
        await fs.ensureDir(path.dirname(fullPath));
        await fs.writeFile(fullPath, buffer);
        console.log(url);
    } else {
        console.log("Skipping " + url);
    }
    downloadedAssets.set(url, filename);
    return filename;
}

// Process the given url.
// Fetches the html pages, extracts links. If it's assets, download them. If
// it's html page process the page recursively.
// All asset path will be updated to point to local.
// Fetched urls are cached to prevent refetch.
async function processPage(url) {
    if (visitedPages.has(url)) return;  // comment this to fetch irrespective of cache.

    const canonicalUrl = canonicalize(url);
    if (visitedPages.has(canonicalUrl)) return;
    visitedPages.add(canonicalUrl);

    if (FETCH_DELAY_MS > 0) {
        await sleep(FETCH_DELAY_MS);
    }

    const res = await fetch(url);
    if (!res.ok) return;

    const html = await res.text();
    const dom = new JSDOM(html, { url });
    const document = dom.window.document;
    const baseUrl = new URL(url);
    const pageLocalPath = path.join(OUTPUT_DIR, localPathForUrl(url));

    async function rewriteAttr(el, attr) {
        const value = el.getAttribute(attr);
        if (!value) return;

        try {
            const absolute = new URL(value, baseUrl).href;
            const assetLocal = await downloadAsset(absolute);
            if (!assetLocal) return;

            const assetFullPath = path.join(OUTPUT_DIR, assetLocal);
            const relative = path.relative(
                path.dirname(pageLocalPath),
                assetFullPath
            );
            el.setAttribute(attr, relative);
        } catch(e) {
            console.log("error: rewriteAttr: %o", e);
        }
    }

    for (const meta of document.querySelectorAll('meta[name="book-json"][content]')) {
        const value = meta.getAttribute("content");
        if (!value) continue;

        try {
            const absolute = new URL(value, baseUrl).href;
            const result = await downloadAsset(absolute);
            if (!result) continue;

            const assetFullPath = path.join(OUTPUT_DIR, result);
            const relative = path.relative(
                path.dirname(pageLocalPath),
                assetFullPath
            );
            meta.setAttribute("content", relative);

            // Download sample code zip if present.
            if (result.endsWith("book.json")) {
                const jsonText = await fs.readFile(assetFullPath, "utf8");
                const bookJson = JSON.parse(jsonText);
                if (bookJson.sampleCode) {
                    const sampleZipUrl = new URL(
                        bookJson.sampleCode,
                        absolute
                    ).href;
                    await downloadAsset(sampleZipUrl);
                }
            }
        } catch(e) {
            console.error("error: book-json: %o", e);
        }
    }

    for (const el of document.querySelectorAll("img[src], script[src], link[href]")) {
        const attr = el.tagName === "LINK" ? "href" : "src";
        await rewriteAttr(el, attr);
    }

    for (const a of document.querySelectorAll("a[href]")) {
        const href = a.getAttribute("href");
        if (!href) continue;

        try {
            const absolute = new URL(href, baseUrl);

            const isAllowed =
                absolute.origin === ALLOWED_ORIGIN &&
                ALLOWED_PATH_PREFIXES.some(prefix =>
                    absolute.pathname.startsWith(prefix)
                );

            if (!isAllowed) continue;

            if (FETCH_DELAY_MS > 0) {
                await sleep(FETCH_DELAY_MS);
            }

            const res = await fetch(absolute.href, { method: "HEAD" });
            if (!res.ok) continue;

            const contentType = res.headers.get("content-type");

            // Non HTML content like zip, pdf, etc.
            if (!isHtmlContentType(contentType)) {
                const assetLocal = await downloadAsset(absolute.href);
                if (!assetLocal) continue;

                const assetFullPath = path.join(OUTPUT_DIR, assetLocal);
                const relative = path.relative(
                    path.dirname(pageLocalPath),
                    assetFullPath
                );

                a.setAttribute("href", relative);
                continue;
            }

            // HTML content
            const canonical = canonicalize(absolute.href);
            a.setAttribute("href", localPathForUrl(canonical));

            if (!visitedPages.has(canonical)) {
                await processPage(absolute.href);
            }
        } catch {}
    }

    if (!(await fs.pathExists(pageLocalPath))) {
        await fs.ensureDir(path.dirname(pageLocalPath));
        await fs.writeFile(
            pageLocalPath,
            "<!DOCTYPE html>\n" + document.documentElement.outerHTML
        );
        console.log(url);
    } else {
        console.log("Skipping " + url);
    }

    await fs.writeJson(STATE_FILE, {
        visitedPages: [...visitedPages]
    });
}

(async () => {
    const startTime = new Date();
    console.log("Crawl started at:", startTime.toLocaleString());
    await fs.ensureDir(OUTPUT_DIR);
    const queue = [...START_URLS];
    while (queue.length > 0) {
        const url = queue.shift();
        await processPage(url);
    }
    const endTime = new Date();
    const durationMinutes = (endTime - startTime) / (1000 * 60);
    console.log("Crawl finished at:", endTime.toLocaleString());
    console.log("Offline site saved to", OUTPUT_DIR);
    console.log("Total duration (mins):", durationMinutes.toFixed(2));
})();

/* Helper scripts */

/*

Get all links from the current page. Execute in dev tools console after the page
has been loaded and the table of content is displayed if present.

var links = new Set();

document.querySelectorAll("a[href]").forEach(a => {
  try {
    const url = new URL(a.getAttribute("href"), window.location.href);
    url.hash = "";  // remove fragment
    links.add(url.href);
  } catch () {}
});

console.log([...links]);

*/

/*
 
Copy dynamically appended body contents from the archive's main page. Run this
in chrome dev console.

copy(document.body.innerHTML.replace(/\n/g, ""))

*/
The full program can be downloaded from the GitHub repository.