267 lines
8.4 KiB
JavaScript
Executable File
267 lines
8.4 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
/* Copyright (c) Sebastian Hugentobler <sebastian@vanwa.ch> 2020.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
|
|
|
const tmp = require('tmp');
|
|
var path = require('path');
|
|
const fs = require('fs');
|
|
const { Builder, By, Capabilities, Key, until } = require('selenium-webdriver');
|
|
const fx = require('selenium-webdriver/firefox');
|
|
const yargs = require('yargs/yargs')
|
|
const { hideBin } = require('yargs/helpers');
|
|
|
|
/** Address that is opened for the login and again after each download. */
const URL = 'https://zeitungsarchiv.nzz.ch/';

/**
 * Passed as the last argument of the `driver.wait` calls in this file, in
 * milliseconds (presumably the polling interval; verify against the
 * selenium-webdriver documentation).
 */
const RETRY_DELAY = 2000;

/** Builds the message used when a wait of `ms` milliseconds runs out. */
const timeoutMessage = (ms) => `Timeout after ${ms / 1000} seconds.`;

/** Upper bound in milliseconds for locating ordinary page elements. */
const WAIT_TIMEOUT = 2000;
const TIMEOUT_MSG = timeoutMessage(WAIT_TIMEOUT);

/** Upper bound in milliseconds for the search results to show up. */
const SEARCH_WAIT_TIMEOUT = 5000;
const SEARCH_TIMEOUT_MSG = timeoutMessage(SEARCH_WAIT_TIMEOUT);
|
|
|
|
/**
 * Format the date as `YYYY-MM-DD` using the local-time getters.
 * Month and day are zero-padded to two digits; the year is used as is.
 *
 * @returns {string} The formatted date.
 */
Date.prototype.isoDate = function () {
  const year = this.getFullYear();
  const month = String(this.getMonth() + 1).padStart(2, '0');
  const day = String(this.getDate()).padStart(2, '0');

  return [year, month, day].join('-');
};
|
|
|
|
/**
 * Format the date as `DD.MM.YYYY` using the local-time getters. This is the
 * string that gets typed into the date inputs of the archive search.
 *
 * @returns {string} The formatted date.
 */
Date.prototype.nzzDate = function () {
  const twoDigits = (value) => String(value).padStart(2, '0');
  const parts = [
    twoDigits(this.getDate()),
    twoDigits(this.getMonth() + 1),
    this.getFullYear(),
  ];

  return parts.join('.');
};
|
|
|
|
/**
 * Return a new date that lies `days` days after this one (before it for
 * negative values). The receiver itself is left untouched.
 *
 * @param {number} days Number of days to add.
 * @returns {Date} The shifted copy.
 */
Date.prototype.addDays = function (days) {
  const shifted = new Date(this.valueOf());
  // setDate rolls over into the neighbouring month/year when needed.
  shifted.setDate(shifted.getDate() + days);

  return shifted;
};
|
|
|
|
/**
 * Pause for the given amount of time.
 *
 * @param {number} ms Milliseconds to wait.
 * @returns {Promise<void>} Resolves once the timer has fired.
 */
function sleep(ms) {
  return new Promise((done) => setTimeout(done, ms));
}
|
|
|
|
/**
 * Wait for an in-progress download to finish and copy the file to its final
 * destination.
 *
 * The download directory is polled until the expected PDF exists and is not
 * empty, or until `maxAttempts` checks have been made. The promise never
 * rejects for a missing file or a failed copy, so it is safe to start this
 * without awaiting it; problems are reported on stderr and through the
 * resolved value instead.
 *
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 * @param {Date} date Date of the issue.
 * @param {number} [retryDelay=2000] Milliseconds to wait between two checks.
 * @param {number} [maxAttempts=60] Number of checks before giving up.
 * @returns {Promise<boolean>} `true` when the file was copied, `false` when
 *   the download never showed up or could not be copied.
 */
async function moveDownload(tmpDir, outDir, date, retryDelay = 2000, maxAttempts = 60) {
  // Dates in filenames are off by one before 1894
  const fileDate = date.getFullYear() < 1894 ? date.addDays(-1) : date;
  const srcFile = path.join(tmpDir, `Gesamtausgabe_NZZ_-_Neue_Zürcher_Zeitung_${fileDate.isoDate()}.pdf`);
  const destFile = path.join(outDir, `${date.isoDate()}.pdf`);

  // A zero-byte PDF can never be a finished download, so an empty file is
  // treated like a missing one.
  // NOTE(review): presumably the browser creates an empty placeholder while
  // it is still downloading - confirm.
  const isDownloaded = () => {
    try {
      return fs.statSync(srcFile).size > 0;
    } catch {
      // statSync throws when the file does not exist (yet)
      return false;
    }
  };

  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    if (isDownloaded()) {
      try {
        fs.copyFileSync(srcFile, destFile);
        return true;
      } catch (err) {
        console.error(`\tcould not copy ${srcFile} to ${destFile}: ${err.message}`);
        return false;
      }
    }

    if (attempt < maxAttempts) {
      await new Promise((resolve) => setTimeout(resolve, retryDelay));
    }
  }

  // this means it was tried to download a wrong issue, or the download
  // did not finish in time
  console.error(`\tno finished download found for ${date.isoDate()}`);
  return false;
}
|
|
|
|
/**
 * Type the date of the wanted issue into both date inputs of the search, so
 * that start and end of the searched range are the same day.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} date Date of the issue.
 */
async function enterDate(driver, date) {
  const formatted = date.nzzDate();
  const locateInput = (selector) =>
    driver.wait(until.elementLocated(By.css(selector)), WAIT_TIMEOUT, TIMEOUT_MSG, RETRY_DELAY);

  const rangeStart = await locateInput('input.fup-s-date-start');
  await rangeStart.clear();
  await rangeStart.sendKeys(formatted);

  const rangeEnd = await locateInput('input.fup-s-date-end');
  await rangeEnd.clear();
  // The ENTER key is sent right after the end date
  // (presumably this is what triggers the search).
  await rangeEnd.sendKeys(`${formatted}${Key.ENTER}`);

  await sleep(500);
}
|
|
|
|
/**
 * Open the NZZ archive and log in with the given credentials.
 *
 * The login entry of the menu is clicked first, then the e-mail address and
 * the password are typed in one after the other, each followed by ENTER.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {String} user Username for the login.
 * @param {String} password Password for the user.
 */
async function login(driver, user, password) {
  console.log('logging in...');

  const locateById = (id) =>
    driver.wait(until.elementLocated(By.id(id)), WAIT_TIMEOUT, TIMEOUT_MSG, RETRY_DELAY);

  await driver.get(URL);
  await sleep(500);

  const loginEntry = await driver.findElement(By.css('.fup-menu-login-container'));
  await driver.wait(until.elementIsVisible(loginEntry), WAIT_TIMEOUT);
  await sleep(1000);
  await loginEntry.click();

  const emailInput = await locateById('c1-login-field');
  await driver.wait(until.elementIsVisible(emailInput), WAIT_TIMEOUT);
  await emailInput.sendKeys(`${user}${Key.ENTER}`);

  const passwordInput = await locateById('c1-password-field');
  await passwordInput.sendKeys(`${password}${Key.ENTER}`);
}
|
|
|
|
/**
 * Start the download of a full issue.
 *
 * Clicks the download menu and then its "edition" entry, and afterwards waits
 * until the "back" entry of the menu is visible.
 *
 * @param {WebDriver} driver Selenium driver to use.
 */
async function download(driver) {
  const locate = (selector) =>
    driver.wait(until.elementLocated(By.css(selector)), WAIT_TIMEOUT, TIMEOUT_MSG, RETRY_DELAY);

  const downloadMenu = await locate('.fup-menu-item-download');
  await downloadMenu.click();

  const editionEntry = await locate('.fup-s-menu-download-edition');
  await editionEntry.click();

  const backEntry = await locate('.fup-s-menu-back');
  await driver.wait(until.elementIsVisible(backEntry), WAIT_TIMEOUT);
  await sleep(500);
}
|
|
|
|
/**
 * Download all issues in a certain time span.
 *
 * Every day from `from` up to and including `to` is searched. When the search
 * yields results, the first one is opened and the download of the whole issue
 * is started. Copying the finished downloads happens in the background while
 * the next days are processed; the function only returns once all of those
 * background copies have settled, so the download directory can be removed
 * safely afterwards.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 * @throws {RangeError} When `from` or `to` is not a valid date.
 */
async function findIssues(driver, from, to, tmpDir, outDir) {
  // An invalid date never compares equal to or smaller than anything, which
  // would make the loop below either endless or silently empty.
  if (Number.isNaN(from.getTime()) || Number.isNaN(to.getTime())) {
    throw new RangeError('"from" and "to" must be valid dates');
  }

  const lastDay = to.toDateString();
  const pendingMoves = [];

  try {
    // The second condition keeps the last day included when `from` carries a
    // later time of day than `to`; a `from` after `to` ends the loop at once.
    for (let day = from; day <= to || day.toDateString() === lastDay; day = day.addDays(1)) {
      console.log(`checking ${day.isoDate()}...`);

      await enterDate(driver, day);

      try {
        const articles = await driver.wait(until.elementsLocated(By.css('.fup-archive-result-item-article-title')), SEARCH_WAIT_TIMEOUT, SEARCH_TIMEOUT_MSG, RETRY_DELAY);
        await articles[0].click();
        await sleep(500);

        await download(driver);
        console.log(`\tdownloading...`);
        await sleep(500);

        // do this in the background, but keep the promise so that it can be
        // awaited before returning
        pendingMoves.push(
          moveDownload(tmpDir, outDir, day).catch((err) => {
            console.error(`\tcould not store the issue of ${day.isoDate()}: ${err.message}`);
          }),
        );

        await driver.get(URL);
      } catch {
        // this means there is no issue on the searched date
        // move along with the next date
      }
    }
  } finally {
    // Also wait when a later day failed, so already started copies can finish.
    await Promise.all(pendingMoves);
  }
}
|
|
|
|
/**
 * Setup the headless browser and download all issues in the specified time span.
 *
 * Firefox is configured to store PDFs without asking in a temporary download
 * directory. The browser is closed and the temporary directory is removed in
 * any case, also when logging in or downloading fails.
 *
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {String} user Username for the nzz archive.
 * @param {String} password Password for the user.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function run(from, to, user, password, outDir) {
  // `recursive` also creates missing parent directories and does not fail
  // when the directory already exists.
  fs.mkdirSync(outDir, { recursive: true });

  const tmpDir = tmp.dirSync();
  console.log(`downloading to ${outDir} (tmp dir: ${tmpDir.name})...`);

  let driver;

  try {
    const fxOptions = new fx.Options()
      .addArguments('-headless')
      .setPreference('pdfjs.disabled', true)
      .setPreference('browser.helperApps.neverAsk.openFile', 'application/pdf')
      .setPreference('browser.download.folderList', 2)
      // NOTE(review): the key used to end in "showWhenStartingout", which looks
      // like a typo of this preference - confirm against the Firefox prefs.
      .setPreference('browser.download.manager.showWhenStarting', false)
      .setPreference('browser.download.dir', tmpDir.name)
      .setPreference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf');

    const caps = new Capabilities();
    caps.setPageLoadStrategy("normal");

    // Built inside the try block so that the temporary directory is cleaned
    // up as well when the browser cannot be started.
    driver = await new Builder()
      .withCapabilities(caps)
      .forBrowser('firefox')
      .setFirefoxOptions(fxOptions)
      .build();

    await login(driver, user, password);
    await findIssues(driver, from, to, tmpDir.name, outDir);
    await sleep(1000);
  } finally {
    if (driver) {
      try {
        // Wait for the browser to be gone before its download directory is deleted.
        await driver.quit();
      } catch (err) {
        // Do not let a failing shutdown hide the error that brought us here.
        console.error(`could not close the browser: ${err.message}`);
      }
    }

    // fs.rmdirSync with `recursive` is deprecated; prefer fs.rmSync where the
    // running Node.js version provides it.
    if (typeof fs.rmSync === 'function') {
      fs.rmSync(tmpDir.name, { recursive: true, force: true });
    } else {
      fs.rmdirSync(tmpDir.name, { recursive: true });
    }
  }
}
|
|
|
|
/**
 * Parse arguments and start the downloading of the issues.
 *
 * Both dates default to today. The process ends with exit code 1 when a date
 * cannot be parsed, when "from" lies after "to" or when the download fails.
 */
(async function init() {
  const nowString = new Date().isoDate();

  const argv = yargs(hideBin(process.argv))
    .usage('Usage: $0 -f [date] -t [date] -o [path] -u [username] -p [password]')
    .demandOption(['u', 'p'])
    // Keep all values as typed: without this, numeric-looking input (e.g. a
    // password like "0123") is turned into a number by the argument parser.
    .string(['f', 't', 'o', 'u', 'p'])
    .help('h')
    .describe('f', 'Earliest issue to download.')
    .describe('t', 'Latest issue to download.')
    .describe('o', 'Download directory.')
    .describe('u', 'Username for the nzz archive.')
    .describe('p', 'Password for the user.')
    .alias('h', 'help')
    .alias('f', 'from')
    .alias('t', 'to')
    .alias('o', 'out')
    .alias('u', 'user')
    .alias('p', 'password')
    .default('f', nowString)
    .default('t', nowString)
    .default('o', './nzz')
    .epilog('Copyright (c) Sebastian Hugentobler <sebastian@vanwa.ch> 2020')
    .example("$0 -u 'myuser@example.com' -p 'mypassword' -f 1780-01-01 -t 1780-02-28", 'Download all existing issues from 01-01-1780 until 28-02-1780 to the default directory "./nzz"')
    .argv;

  const from = new Date(argv.from);
  const to = new Date(argv.to);

  // An unparsable date would pass the range check below, because every
  // comparison with an invalid date is false.
  if (Number.isNaN(from.getTime()) || Number.isNaN(to.getTime())) {
    console.error('"from" and "to" must be valid dates, e.g. 1780-01-01');
    process.exit(1);
  }

  if (from > to) {
    console.error('"from" date must be before "to" date');
    process.exit(1);
  }

  try {
    await run(from, to, argv.user, argv.password, argv.out);
  } catch (err) {
    // Report the failure instead of leaving a rejected promise unhandled.
    console.error(err);
    process.exitCode = 1;
  }
})();
|