#!/usr/bin/env node
/* Copyright (c) Sebastian Hugentobler 2020.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

const tmp = require('tmp');
const path = require('path');
const fs = require('fs');
const { Builder, By, Capabilities, Key, until } = require('selenium-webdriver');
const fx = require('selenium-webdriver/firefox');
const yargs = require('yargs/yargs');
const { hideBin } = require('yargs/helpers');

const URL = 'https://zeitungsarchiv.nzz.ch/';
const RETRY_DELAY = 2000;
const WAIT_TIMEOUT = 2000;
const TIMEOUT_MSG = `Timeout after ${WAIT_TIMEOUT / 1000} seconds.`;
const SEARCH_WAIT_TIMEOUT = 5000;
const SEARCH_TIMEOUT_MSG = `Timeout after ${SEARCH_WAIT_TIMEOUT / 1000} seconds.`;
// How often and for how long moveDownload() looks for a finished download.
const DOWNLOAD_POLL_INTERVAL = 2000;
const DOWNLOAD_TIMEOUT = 120000;

// NOTE: the Date.prototype helpers below are kept as they are because the
// rest of the script calls them as methods on Date instances.

/** Format the date as `YYYY-MM-DD`, using the local time zone. */
Date.prototype.isoDate = function () {
  return `${this.getFullYear()}-${String(this.getMonth() + 1).padStart(2, '0')}-${String(this.getDate()).padStart(2, '0')}`;
};

/** Format the date as `DD.MM.YYYY`, using the local time zone. */
Date.prototype.nzzDate = function () {
  return `${String(this.getDate()).padStart(2, '0')}.${String(this.getMonth() + 1).padStart(2, '0')}.${this.getFullYear()}`;
};

/** Return a new Date that is `days` days later (earlier if negative). */
Date.prototype.addDays = function (days) {
  const date = new Date(this.valueOf());
  date.setDate(date.getDate() + days);
  return date;
};

/**
 * Resolve after the given delay.
 *
 * @param {number} ms Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Copy of the given date with the local time of day set to midnight, so that
 * two dates can be compared by calendar day only.
 *
 * @param {Date} date Date to truncate.
 * @returns {Date}
 */
function startOfDay(date) {
  const day = new Date(date.valueOf());
  day.setHours(0, 0, 0, 0);
  return day;
}

/**
 * Wait until an element matching the locator is present, using the default
 * timeout, timeout message and polling interval of this script.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {By} locator Locator of the element.
 * @returns {Promise<WebElement>} The located element.
 */
function waitForElement(driver, locator) {
  return driver.wait(until.elementLocated(locator), WAIT_TIMEOUT, TIMEOUT_MSG, RETRY_DELAY);
}

/**
 * Check whether a downloaded file looks complete: it exists, is not empty and
 * has no `.part` file next to it.
 *
 * NOTE(review): this assumes Firefox marks an unfinished download with an
 * empty placeholder and/or a `<name>.part` sibling - confirm for the Firefox
 * version in use.
 *
 * @param {fs.PathLike} file File to check.
 * @returns {boolean}
 */
function isDownloadComplete(file) {
  try {
    return fs.statSync(file).size > 0 && !fs.existsSync(`${file}.part`);
  } catch {
    // stat fails as long as the file does not exist yet
    return false;
  }
}

/**
 * Wait for an in progress download to finish and copy the file to the correct
 * destination.
 *
 * The returned promise resolves once the file was copied, or once waiting was
 * given up after `timeout` milliseconds. Failures are logged instead of
 * rejecting, so the function can safely be started without being awaited.
 *
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 * @param {Date} date Date of the issue.
 * @param {number} [timeout] Maximum time to wait for the file, in milliseconds.
 * @returns {Promise<void>}
 */
async function moveDownload(tmpDir, outDir, date, timeout = DOWNLOAD_TIMEOUT) {
  // Dates in filenames are off by one before 1894
  const fileDate = date.getFullYear() < 1894 ? date.addDays(-1) : date;
  const srcFile = path.join(tmpDir, `Gesamtausgabe_NZZ_-_Neue_Zürcher_Zeitung_${fileDate.isoDate()}.pdf`);
  const destFile = path.join(outDir, `${date.isoDate()}.pdf`);

  const deadline = Date.now() + timeout;
  while (!isDownloadComplete(srcFile)) {
    if (Date.now() >= deadline) {
      // most likely the issue of a different date was downloaded
      console.error(`\tno finished download found for ${date.isoDate()}, giving up.`);
      return;
    }
    await sleep(DOWNLOAD_POLL_INTERVAL);
  }

  try {
    fs.copyFileSync(srcFile, destFile);
  } catch (err) {
    console.error(`\tcould not copy ${srcFile} to ${destFile}: ${err.message}`);
  }
}

/**
 * Enter the dates of the issue to download.
 *
 * The same date is used as start and end of the search range; the search is
 * submitted by pressing enter in the end date field.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} date Date of the issue.
 */
async function enterDate(driver, date) {
  const dateString = date.nzzDate();

  const startDate = await waitForElement(driver, By.css('input.fup-s-date-start'));
  await startDate.clear();
  await startDate.sendKeys(dateString);

  const endDate = await waitForElement(driver, By.css('input.fup-s-date-end'));
  await endDate.clear();
  await endDate.sendKeys(dateString + Key.ENTER);

  await sleep(500);
}

/**
 * Login to the NZZ archive.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {String} user Username for the login.
 * @param {String} password Password for the user.
 */
async function login(driver, user, password) {
  console.log('logging in...');

  await driver.get(URL);
  await sleep(500);

  const loginButton = await driver.findElement(By.css('.fup-menu-login-container'));
  await driver.wait(until.elementIsVisible(loginButton), WAIT_TIMEOUT);
  await sleep(1000);
  await loginButton.click();

  const emailField = await waitForElement(driver, By.id('c1-login-field'));
  await driver.wait(until.elementIsVisible(emailField), WAIT_TIMEOUT);
  await emailField.sendKeys(user + Key.ENTER);

  const pwField = await waitForElement(driver, By.id('c1-password-field'));
  await pwField.sendKeys(password + Key.ENTER);
}

/**
 * Start the download of a full issue.
 *
 * @param {WebDriver} driver Selenium driver to use.
 */
async function download(driver) {
  const menu = await waitForElement(driver, By.css('.fup-menu-item-download'));
  await menu.click();

  const downloadEdition = await waitForElement(driver, By.css('.fup-s-menu-download-edition'));
  await downloadEdition.click();

  const back = await waitForElement(driver, By.css('.fup-s-menu-back'));
  await driver.wait(until.elementIsVisible(back), WAIT_TIMEOUT);
  await sleep(500);
}

/**
 * Download all issues in a certain time span.
 *
 * Every calendar day from `from` to `to` (both inclusive) is searched. The
 * downloaded files are copied to `outDir` in the background; the function
 * only returns after all of those copies finished or were given up.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function findIssues(driver, from, to, tmpDir, outDir) {
  const lastDay = startOfDay(to);
  const pendingMoves = [];

  try {
    for (let current = from; startOfDay(current) <= lastDay; current = current.addDays(1)) {
      console.log(`checking ${current.isoDate()}...`);
      await enterDate(driver, current);

      let issueOpened = false;
      try {
        const articles = await driver.wait(until.elementsLocated(By.css('.fup-archive-result-item-article-title')), SEARCH_WAIT_TIMEOUT, SEARCH_TIMEOUT_MSG, RETRY_DELAY);
        await articles[0].click();
        issueOpened = true;
        await sleep(500);

        await download(driver);
        console.log(`\tdownloading...`);
        await sleep(500);

        // do this in the background
        pendingMoves.push(moveDownload(tmpDir, outDir, current));
      } catch (err) {
        // A timeout while searching means there is no issue on the searched
        // date; anything else is reported before moving along with the next
        // date.
        if (issueOpened || err.name !== 'TimeoutError') {
          console.error(`\tdownload failed: ${err.message}`);
        }
      }

      if (issueOpened) {
        // leave the issue viewer so the next search starts from the search page
        await driver.get(URL);
      }
    }
  } finally {
    // do not let the caller close the browser while downloads are in flight
    await Promise.all(pendingMoves);
  }
}

/**
 * Remove a directory including its content.
 *
 * @param {fs.PathLike} dir Directory to remove.
 */
function removeDirectory(dir) {
  if (typeof fs.rmSync === 'function') {
    fs.rmSync(dir, { recursive: true, force: true });
  } else {
    // older Node.js versions without fs.rmSync
    fs.rmdirSync(dir, { recursive: true });
  }
}

/**
 * Setup the headless browser and download all issues in the specified time span.
 *
 * The browser is closed and the temporary download directory is removed when
 * the function finishes, whether it succeeded or not.
 *
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {String} user Username for the nzz archive.
 * @param {String} password Password for the user.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function run(from, to, user, password, outDir) {
  // recursive: creates missing parent directories and accepts an existing one
  fs.mkdirSync(outDir, { recursive: true });

  const tmpDir = tmp.dirSync();
  console.log(`downloading to ${outDir} (tmp dir: ${tmpDir.name})...`);

  let driver;
  try {
    const fxOptions = new fx.Options()
      .addArguments('-headless')
      .setPreference('pdfjs.disabled', true)
      .setPreference('browser.helperApps.neverAsk.openFile', 'application/pdf')
      .setPreference('browser.download.folderList', 2)
      .setPreference('browser.download.manager.showWhenStarting', false)
      .setPreference('browser.download.dir', tmpDir.name)
      .setPreference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf');

    const caps = new Capabilities();
    caps.setPageLoadStrategy('normal');

    driver = await new Builder()
      .withCapabilities(caps)
      .forBrowser('firefox')
      .setFirefoxOptions(fxOptions)
      .build();

    await login(driver, user, password);
    await findIssues(driver, from, to, tmpDir.name, outDir);
  } finally {
    try {
      if (driver) {
        await driver.quit();
      }
    } finally {
      removeDirectory(tmpDir.name);
    }
  }
}

/**
 * Parse a date given on the command line.
 *
 * Values of the form `YYYY-MM-DD` are interpreted in the local time zone,
 * which is the time zone all date formatting in this script uses. A day of
 * month up to 31 is accepted for every month and rolls over into the next
 * month where needed. Any other value is handed to the Date constructor.
 *
 * @param {*} value Raw argument value.
 * @returns {Date} The parsed date; an invalid Date if parsing failed.
 */
function parseDate(value) {
  const text = String(value);
  const match = /^(\d{4})-(\d{2})-(\d{2})$/.exec(text);
  if (!match) {
    return new Date(text);
  }

  const [year, month, day] = match.slice(1).map(Number);
  if (month < 1 || month > 12 || day < 1 || day > 31) {
    return new Date(NaN);
  }
  return new Date(year, month - 1, day);
}

/**
 * Parse arguments and start the downloading of the issues.
 */
(async function init() {
  const nowString = new Date().isoDate();

  const argv = yargs(hideBin(process.argv))
    .usage('Usage: $0 -f [date] -t [date] -o [path] -u [username] -p [password]')
    .demandOption(['u', 'p'])
    .help('h')
    .describe('f', 'Earliest issue to download.')
    .describe('t', 'Latest issue to download.')
    .describe('o', 'Download directory.')
    .describe('u', 'Username for the nzz archive.')
    .describe('p', 'Password for the user.')
    .alias('h', 'help')
    .alias('f', 'from')
    .alias('t', 'to')
    .alias('o', 'out')
    .alias('u', 'user')
    .alias('p', 'password')
    .default('f', nowString)
    .default('t', nowString)
    .default('o', './nzz')
    .epilog('Copyright (c) Sebastian Hugentobler 2020')
    .example("$0 -u 'myuser@example.com' -p 'mypassword' -f 1780-01-01 -t 1780-02-28", 'Download all existing issues from 01-01-1780 until 28-02-1780 to the default directory "./nzz"')
    .argv;

  const from = parseDate(argv.from);
  const to = parseDate(argv.to);

  if (Number.isNaN(from.valueOf()) || Number.isNaN(to.valueOf())) {
    console.error('"from" and "to" must be valid dates in the form YYYY-MM-DD');
    process.exit(1);
  }

  if (from > to) {
    console.error('"from" date must be before "to" date');
    process.exit(1);
  }

  try {
    await run(from, to, argv.user, argv.password, argv.out);
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  }
})();