336 lines
9.7 KiB
JavaScript
Executable File
336 lines
9.7 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
/* Copyright (c) Sebastian Hugentobler <shu@vanwa.ch> 2024.
|
|
*
|
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
|
|
|
const tmp = require("tmp");
|
|
const path = require("path");
|
|
const fs = require("fs");
|
|
const { Builder, By, Capabilities, Key, until } = require("selenium-webdriver");
|
|
const fx = require("selenium-webdriver/firefox");
|
|
const yargs = require("yargs/yargs");
|
|
const { hideBin } = require("yargs/helpers");
|
|
|
|
// Entry page of the NZZ newspaper archive.
const URL = "https://zeitungsarchiv.nzz.ch/";

// All timeouts are given in milliseconds.
const WAIT_TIMEOUT = 10 * 1000;
const SEARCH_WAIT_TIMEOUT = 15 * 1000;
const DOWNLOAD_TIMEOUT = 20 * 1000;

// Build the message handed to `driver.wait` for a given timeout.
const timeoutMessage = (ms) => `Timeout after ${ms / 1000} seconds.`;
const TIMEOUT_MSG = timeoutMessage(WAIT_TIMEOUT);
const SEARCH_TIMEOUT_MSG = timeoutMessage(SEARCH_WAIT_TIMEOUT);

// User agent string the browser is configured to send.
const USER_AGENT =
  "Mozilla/5.0 (X11; Linux x86_64; rv:127.0) Gecko/20100101 Firefox/127.0";
|
|
|
|
/**
 * Format the date as `YYYY-MM-DD` using the local-time year, month and day.
 * Month and day are zero-padded to two digits; the year is not padded.
 *
 * @returns {String} The formatted date.
 */
Date.prototype.isoDate = function () {
  const year = this.getFullYear();
  const month = String(this.getMonth() + 1).padStart(2, "0");
  const day = String(this.getDate()).padStart(2, "0");

  return [year, month, day].join("-");
};
|
|
|
|
/**
 * Format the date as `DD.MM.YYYY` using the local-time day, month and year.
 * This is the string typed into the archive's date inputs.
 *
 * @returns {String} The formatted date.
 */
Date.prototype.nzzDate = function () {
  const day = String(this.getDate()).padStart(2, "0");
  const month = String(this.getMonth() + 1).padStart(2, "0");
  const year = this.getFullYear();

  return [day, month, year].join(".");
};
|
|
|
|
/**
 * Return a new date shifted by a number of calendar days.
 * The date this is called on is left unchanged.
 *
 * @param {number} days Days to add; negative values move backwards.
 * @returns {Date} The shifted copy.
 */
Date.prototype.addDays = function (days) {
  const shifted = new Date(this.getTime());
  const targetDay = shifted.getDate() + days;

  // setDate rolls over month and year boundaries on its own
  shifted.setDate(targetDay);

  return shifted;
};
|
|
|
|
/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms Milliseconds to wait.
 * @returns {Promise<void>} Resolves once the time has elapsed.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Wait for an in-progress download to finish and copy the file to its final
 * destination.
 *
 * The download directory is polled until the expected file shows up or the
 * timeout elapses. The returned promise never rejects, so it can be started
 * in the background without a rejection handler; failures are logged and
 * reported through the resolved value instead.
 *
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 * @param {Date} date Date of the issue.
 * @param {Object} [options] Polling behaviour.
 * @param {number} [options.pollIntervalMs=2000] Pause between two checks for the file.
 * @param {number} [options.timeoutMs=60000] Stop waiting after this many milliseconds.
 * @returns {Promise<boolean>} `true` if the file was copied, `false` otherwise.
 */
async function moveDownload(
  tmpDir,
  outDir,
  date,
  { pollIntervalMs = 2000, timeoutMs = 60000 } = {},
) {
  const isoDate = date.isoDate();
  const srcFile = path.join(
    tmpDir,
    `Gesamtausgabe_NZZ_-_Neue_Zürcher_Zeitung_${isoDate}.pdf`,
  );
  const destFile = path.join(outDir, `${isoDate}.pdf`);
  const deadline = Date.now() + timeoutMs;

  while (!fs.existsSync(srcFile)) {
    if (Date.now() >= deadline) {
      // the expected file never appeared, e.g. because a different issue
      // than the requested one was downloaded
      console.error(
        `\tno download found for ${isoDate} after ${timeoutMs / 1000} seconds`,
      );
      return false;
    }

    await new Promise((resolve) => {
      setTimeout(resolve, pollIntervalMs);
    });
  }

  try {
    fs.copyFileSync(srcFile, destFile);
    return true;
  } catch (err) {
    console.error(`\tfailed to copy ${srcFile} to ${destFile}: ${err.message}`);
    return false;
  }
}
|
|
|
|
/**
 * Type the issue date into both the start and the end date field, so that
 * the date range covers exactly one day, and confirm with the Enter key.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} date Date of the issue.
 */
async function enterDate(driver, date) {
  const formatted = date.nzzDate();
  const locate = (selector) =>
    driver.wait(
      until.elementLocated(By.css(selector)),
      WAIT_TIMEOUT,
      TIMEOUT_MSG,
    );

  // the start field is additionally waited on and scrolled to before typing
  const startField = await locate("input.fup-s-date-start");
  await driver.wait(until.elementIsVisible(startField), WAIT_TIMEOUT);
  await driver.actions().scroll(0, 0, 0, 0, startField).perform();
  await startField.clear();
  await startField.sendKeys(formatted);

  const endField = await locate("input.fup-s-date-end");
  await endField.clear();
  await endField.sendKeys(`${formatted}${Key.ENTER}`);

  await sleep(500);
}
|
|
|
|
/**
 * Login to the NZZ archive.
 *
 * Opens the archive, clicks the login entry, fills the credentials into the
 * login form inside the "piano" iframe and submits it. Afterwards it waits
 * for the login menu element in the main document to become visible
 * (presumably the indicator of a successful login; verify against the site).
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {String} user Username for the login.
 * @param {String} password Password for the user.
 */
async function login(driver, user, password) {
  const locate = (selector) =>
    driver.wait(
      until.elementLocated(By.css(selector)),
      WAIT_TIMEOUT,
      TIMEOUT_MSG,
    );

  console.log("logging in...");

  await driver.get(URL);
  await sleep(500);

  // wait for the element instead of looking it up once: it may not be
  // rendered yet right after the page load
  const loginButton = await locate(".fup-menu-login-container");
  await driver.wait(until.elementIsVisible(loginButton), WAIT_TIMEOUT);
  await loginButton.click();

  // the iframe is only added after the click, so it has to be waited for too
  const iframe = await locate('iframe[id^="piano"]');
  await driver.switchTo().frame(iframe);

  const emailField = await locate('input[name="email"]');
  await driver.wait(until.elementIsVisible(emailField), WAIT_TIMEOUT);
  await emailField.sendKeys(user);

  const pwField = await locate('input[type="password"]');
  await pwField.sendKeys(password);

  const submitButton = await locate('button[class="btn prime"]');
  await driver.wait(until.elementIsVisible(submitButton), WAIT_TIMEOUT);
  await submitButton.click();

  // leave the iframe again, everything else lives in the main document
  await driver.switchTo().defaultContent();
  await sleep(500);

  const loginMenu = await locate(
    ".fup-login-open.fup-button.fup-s-menu-login-open",
  );
  await driver.wait(until.elementIsVisible(loginMenu), WAIT_TIMEOUT);
}
|
|
|
|
/**
 * Start the download of a full issue.
 *
 * Opens the download menu, confirms the download of the whole edition and
 * waits until the loading mask has been shown and removed again.
 *
 * @param {WebDriver} driver Selenium driver to use.
 */
async function download(driver) {
  const locate = (selector) =>
    driver.wait(
      until.elementLocated(By.css(selector)),
      WAIT_TIMEOUT,
      TIMEOUT_MSG,
    );

  const downloadMenu = await locate(".fup-menu-item-download");
  await downloadMenu.click();

  const confirmButton = await locate(
    ".fup-s-menu-download-edition-confirmation",
  );
  await confirmButton.click();

  // the mask has to appear first and then be removed from the document
  const mask = await locate(".fup-loading-mask");
  await driver.wait(until.elementIsVisible(mask), DOWNLOAD_TIMEOUT);
  await driver.wait(until.stalenessOf(mask), DOWNLOAD_TIMEOUT);

  // the back button is only waited for, it is deliberately not clicked
  const backButton = await locate(".fup-s-menu-back");
  await driver.wait(until.elementIsVisible(backButton), WAIT_TIMEOUT);

  await sleep(500);
}
|
|
|
|
/**
 * Download all issues in a certain time span.
 *
 * Every day from `from` up to and including `to` is searched. A day without
 * search results is skipped. A failure while downloading an issue is logged
 * and the browser is sent back to the start page, so that the following days
 * can still be processed. The function only returns after all files that
 * were started in the background have been handled.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function findIssues(driver, from, to, tmpDir, outDir) {
  const lastDay = to.toDateString();
  const pendingMoves = [];

  // start one day early, the loop advances before it processes a day
  let day = from.addDays(-1);

  while (day.toDateString() !== lastDay) {
    day = day.addDays(1);
    console.log(`checking ${day.isoDate()}...`);

    await enterDate(driver, day);

    let articles;
    try {
      articles = await driver.wait(
        until.elementsLocated(By.css(".fup-archive-result-item-article-title")),
        SEARCH_WAIT_TIMEOUT,
        SEARCH_TIMEOUT_MSG,
      );
    } catch {
      // this means there is no issue on the searched date
      // move along with the next date
      console.log(`\tno issues`);
      continue;
    }

    try {
      await articles[0].click();
      await sleep(500);

      await download(driver);
      console.log(`\tdownloading...`);
      await sleep(500);

      // do this in the background, it is awaited after the last day
      pendingMoves.push(moveDownload(tmpDir, outDir, day));
    } catch (err) {
      console.error(`\tdownload failed: ${err}`);
    } finally {
      // always go back to the search page, otherwise the next day cannot be
      // entered after a failed download
      await driver.get(URL);
    }
  }

  // make sure no file is still being handled when the caller cleans up
  await Promise.allSettled(pendingMoves);
}
|
|
|
|
/**
 * Setup the headless browser and download all issues in the specified time span.
 *
 * Downloads go to a temporary directory first. The browser is closed and the
 * temporary directory is removed afterwards, whether the run succeeded or not.
 *
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {String} user Username for the nzz archive.
 * @param {String} password Password for the user.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function run(from, to, user, password, outDir) {
  // recursive: also creates missing parents and does not fail if it exists
  fs.mkdirSync(outDir, { recursive: true });

  const tmpDir = tmp.dirSync();
  console.log(`downloading to ${outDir} (tmp dir: ${tmpDir.name})...`);

  // save PDFs straight into the tmp dir without viewer or download dialog
  const fxOptions = new fx.Options()
    .addArguments("-headless")
    .setPreference("pdfjs.disabled", true)
    .setPreference("general.useragent.override", USER_AGENT)
    .setPreference("browser.helperApps.neverAsk.openFile", "application/pdf")
    .setPreference("browser.download.folderList", 2)
    .setPreference("browser.download.manager.showWhenStarting", false)
    .setPreference("browser.download.dir", tmpDir.name)
    .setPreference("browser.helperApps.neverAsk.saveToDisk", "application/pdf");

  const caps = new Capabilities();
  caps.setPageLoadStrategy("normal");

  let driver;
  try {
    // built inside the try block so the tmp dir is removed if this fails
    driver = await new Builder()
      .withCapabilities(caps)
      .forBrowser("firefox")
      .setFirefoxOptions(fxOptions)
      .build();

    await login(driver, user, password);
    await findIssues(driver, from, to, tmpDir.name, outDir);
    await sleep(1000);
  } finally {
    try {
      // close the browser first so it no longer writes into the tmp dir
      if (driver) {
        await driver.quit();
      }
    } finally {
      try {
        await fs.promises.rm(tmpDir.name, { recursive: true, force: true });
      } catch (e) {
        console.error(`failed to remove tmp directory: ${e}`);
      }
    }
  }
}
|
|
|
|
/**
 * Parse arguments and start the downloading of the issues.
 *
 * Exits with status 1 if a date cannot be parsed or if "from" is after "to".
 * A failing run is reported on stderr and sets a non-zero exit code.
 */
(function init() {
  const now = new Date();
  const nowString = now.isoDate();

  /**
   * Turn a command line value into a date.
   *
   * `YYYY-MM-DD` values are read as local dates. Passing them to `new Date`
   * directly would read them as UTC, which shifts the day in time zones west
   * of UTC because all formatting in this file uses local time.
   *
   * @param {*} value Value of the command line option.
   * @param {String} option Name of the option, used in the error message.
   * @returns {Date} The parsed date.
   */
  function parseDate(value, option) {
    const parts = /^(\d{4})-(\d{2})-(\d{2})$/.exec(String(value));
    const date = parts
      ? new Date(Number(parts[1]), Number(parts[2]) - 1, Number(parts[3]))
      : new Date(value);

    if (Number.isNaN(date.getTime())) {
      console.error(`"${option}" is not a valid date: ${value}`);
      process.exit(1);
    }

    return date;
  }

  const argv = yargs(hideBin(process.argv))
    .usage(
      "Usage: $0 -f [date] -t [date] -o [path] -u [username] -p [password]",
    )
    .demandOption(["u", "p"])
    .help("h")
    .describe("f", "Earliest issue to download.")
    .describe("t", "Latest issue to download.")
    .describe("o", "Download directory.")
    .describe("u", "Username for the nzz archive.")
    .describe("p", "Password for the user.")
    .alias("h", "help")
    .alias("f", "from")
    .alias("t", "to")
    .alias("o", "out")
    .alias("u", "user")
    .alias("p", "password")
    .default("f", nowString)
    .default("t", nowString)
    .default("o", "./nzz")
    .epilog("Copyright (c) Sebastian Hugentobler <shu@vanwa.ch> 2024")
    .example(
      "$0 -u 'myuser@example.com' -p 'mypassword' -f 1780-01-01 -t 1780-02-30",
      'Download all existing issues from 01-01-1780 until 30-02-1780 to the default directory "./nzz"',
    ).argv;

  const from = parseDate(argv.from, "from");
  const to = parseDate(argv.to, "to");

  if (from > to) {
    console.error('"from" date must be before "to" date');
    process.exit(1);
  }

  // report a failed run instead of leaving an unhandled rejection
  run(from, to, argv.user, argv.password, argv.out).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
})();
|