#!/usr/bin/env node /* Copyright (c) Sebastian Hugentobler 2024. * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ const tmp = require("tmp"); const path = require("path"); const fs = require("fs"); const { Builder, By, Capabilities, Key, until } = require("selenium-webdriver"); const fx = require("selenium-webdriver/firefox"); const yargs = require("yargs/yargs"); const { hideBin } = require("yargs/helpers"); const URL = "https://zeitungsarchiv.nzz.ch/"; const WAIT_TIMEOUT = 3000; const TIMEOUT_MSG = `Timeout after ${WAIT_TIMEOUT / 1000} seconds.`; const SEARCH_WAIT_TIMEOUT = 5000; const SEARCH_TIMEOUT_MSG = `Timeout after ${SEARCH_WAIT_TIMEOUT / 1000} seconds.`; const DOWNLOAD_TIMEOUT = 20000; const USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0"; Date.prototype.isoDate = function () { return `${this.getFullYear()}-${String(this.getMonth() + 1).padStart(2, "0")}-${String(this.getDate()).padStart(2, "0")}`; }; Date.prototype.nzzDate = function () { return `${String(this.getDate()).padStart(2, "0")}.${String(this.getMonth() + 1).padStart(2, "0")}.${this.getFullYear()}`; }; Date.prototype.addDays = function (days) { const date = new Date(this.valueOf()); date.setDate(date.getDate() + days); return date; }; function sleep(ms) { return new Promise((resolve) => { setTimeout(resolve, ms); }); } /** * Wait for an in progress download to finish and move the file to the correct * destination. * * @param {fs.PathLike} tmpDir Download directory. * @param {fs.PathLike} outDir Final destination directory. * @param {Date} date Date of the issue. */ function moveDownload(tmpDir, outDir, date) { return new Promise((resolve) => { const srcFile = path.join( tmpDir, `Gesamtausgabe_NZZ_-_Neue_Zürcher_Zeitung_${date.isoDate()}.pdf`, ); if (!fs.existsSync(srcFile)) { setTimeout(() => moveDownload(tmpDir, outDir, date), 2000); } const destFile = path.join(outDir, `${date.isoDate()}.pdf`); try { fs.copyFileSync(srcFile, destFile); } catch { // this means we tried to download a wrong issue } resolve(); }); } /** * Enter the dates of the issue to download. * * @param {WebDriver} driver Selenium driver to use. * @param {Date} date Date of the issue. */ async function enterDate(driver, date) { const dateString = date.nzzDate(); const startDate = await driver.wait( until.elementLocated(By.css("input.fup-s-date-start")), WAIT_TIMEOUT, TIMEOUT_MSG, ); await startDate.clear(); await startDate.sendKeys(dateString); const endDate = await driver.wait( until.elementLocated(By.css("input.fup-s-date-end")), WAIT_TIMEOUT, TIMEOUT_MSG, ); await endDate.clear(); await endDate.sendKeys(dateString + Key.ENTER); await sleep(500); } /** * Login to the NZZ archive. * * @param {WebDriver} driver Selenium driver to use. * @param {String} user Username for the login. * @param {String} password Password for the user. */ async function login(driver, user, password) { console.log("logging in..."); await driver.get(URL); await sleep(500); const loginButton = await driver.findElement( By.css(".fup-menu-login-container"), ); await driver.wait(until.elementIsVisible(loginButton), WAIT_TIMEOUT); await loginButton.click(); const iframe = await driver.findElement(By.css('iframe[id^="piano"]')); await driver.switchTo().frame(iframe); const emailField = await driver.wait( until.elementLocated(By.css('input[name="email"]')), WAIT_TIMEOUT, TIMEOUT_MSG, ); await driver.wait(until.elementIsVisible(emailField), WAIT_TIMEOUT); await emailField.sendKeys(user); const pwField = await driver.wait( until.elementLocated(By.css('input[type="password"]')), WAIT_TIMEOUT, TIMEOUT_MSG, ); await pwField.sendKeys(password); const submitButton = await driver.wait( until.elementLocated(By.css('button[class="btn prime"]')), WAIT_TIMEOUT, TIMEOUT_MSG, ); await driver.wait(until.elementIsVisible(submitButton), WAIT_TIMEOUT); await submitButton.click(); await driver.switchTo().defaultContent(); await sleep(500); const loginMenu = await driver.wait( until.elementLocated( By.css(".fup-login-open.fup-button.fup-s-menu-login-open"), ), WAIT_TIMEOUT, TIMEOUT_MSG, ); await driver.wait(until.elementIsVisible(loginMenu), WAIT_TIMEOUT); } /** * Start the download of a full issue. * * @param {WebDriver} driver Selenium driver to use. */ async function download(driver) { const menu = await driver.wait( until.elementLocated(By.css(".fup-menu-item-download")), WAIT_TIMEOUT, TIMEOUT_MSG, ); await menu.click(); const download = await driver.wait( until.elementLocated(By.css(".fup-s-menu-download-edition-confirmation")), WAIT_TIMEOUT, TIMEOUT_MSG, ); await download.click(); const loadingMask = await driver.wait( until.elementLocated(By.css(".fup-loading-mask")), WAIT_TIMEOUT, TIMEOUT_MSG, ); await driver.wait(until.elementIsVisible(loadingMask), DOWNLOAD_TIMEOUT); await driver.wait(until.stalenessOf(loadingMask), DOWNLOAD_TIMEOUT); const back = await driver.wait( until.elementLocated(By.css(".fup-s-menu-back")), WAIT_TIMEOUT, TIMEOUT_MSG, ); await driver.wait(until.elementIsVisible(back), WAIT_TIMEOUT); // back.click(); await sleep(500); } /** * Download all issues in a certain time span. * * @param {WebDriver} driver Selenium driver to use. * @param {Date} from Earliest issue to download. * @param {Date} to Latest issue to download. * @param {fs.PathLike} tmpDir Download directory. * @param {fs.PathLike} outDir Final destination directory. */ async function findIssues(driver, from, to, tmpDir, outDir) { from = from.addDays(-1); while (from.toDateString() !== to.toDateString()) { from = from.addDays(1); console.log(`checking ${from.isoDate()}...`); await enterDate(driver, from); try { const articles = await driver.wait( until.elementsLocated(By.css(".fup-archive-result-item-article-title")), SEARCH_WAIT_TIMEOUT, SEARCH_TIMEOUT_MSG, ); await articles[0].click(); await sleep(500); await download(driver); console.log(`\tdownloading...`); await sleep(500); // do this in the background moveDownload(tmpDir, outDir, from); await driver.get(URL); } catch { // this means there is no issue on the searched date // move along with the next date console.log(`\tno issues`); } } } /** * Setup the headless browser and download all issues in the specified time span. * * @param {Date} from Earliest issue to download. * @param {Date} to Latest issue to download. * @param {String} user Username for the nzz archive. * @param {String} password Password for the user. * @param {fs.PathLike} outDir Final destination directory. */ async function run(from, to, user, password, outDir) { if (!fs.existsSync(outDir)) { fs.mkdirSync(outDir); } const tmpDir = tmp.dirSync(); console.log(`downloading to ${outDir} (tmp dir: ${tmpDir.name})...`); const fxOptions = new fx.Options() .addArguments("-headless") .setPreference("pdfjs.disabled", true) .setPreference("general.useragent.override", USER_AGENT) .setPreference("browser.helperApps.neverAsk.openFile", "application/pdf") .setPreference("browser.download.folderList", 2) .setPreference("browser.download.manager.showWhenStartingout", false) .setPreference("browser.download.dir", tmpDir.name) .setPreference("browser.helperApps.neverAsk.saveToDisk", "application/pdf"); const caps = new Capabilities(); caps.setPageLoadStrategy("normal"); const driver = await new Builder() .withCapabilities(caps) .forBrowser("firefox") .setFirefoxOptions(fxOptions) .build(); try { await login(driver, user, password); await findIssues(driver, from, to, tmpDir.name, outDir); await sleep(1000); } finally { await fs.rm(tmpDir.name, { recursive: true }, (e) => { if (e) { console.error(`failed to remove tmp directory: ${e}`); } }); await driver.quit(); } } /** * Parse arguments and start the downloading of the issues. */ (function init() { const now = new Date(); const nowString = now.isoDate(); const argv = yargs(hideBin(process.argv)) .usage( "Usage: $0 -f [date] -t [date] -o [path] -u [usernane] -p [password]", ) .demandOption(["u", "p"]) .help("h") .describe("f", "Earliest issue to download.") .describe("t", "Latest issue to download.") .describe("o", "Download directory.") .describe("u", "Username for the nzz archive.") .describe("p", "Password for the user.") .alias("h", "help") .alias("f", "from") .alias("t", "to") .alias("o", "out") .alias("u", "user") .alias("p", "password") .default("f", nowString) .default("t", nowString) .default("o", "./nzz") .epilog("Copyright (c) Sebastian Hugentobler 2024") .example( "$0 -u 'myuser@example.com' -p 'mypassword' -f 1780-01-01 -t 1780-02-30", 'Download all existing issues from 01-01-1780 until 30-02-1780 to the default directory "./nzz"', ).argv; const from = new Date(argv.from); const to = new Date(argv.to); if (from > to) { console.error('"from" date must be before "to" date'); process.exit(1); } run(from, to, argv.user, argv.password, argv.out); })();