2020-12-23 21:21:47 +00:00
#!/usr/bin/env node
/* Copyright (c) Sebastian Hugentobler <sebastian@vanwa.ch> 2020.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
const tmp = require ( 'tmp' ) ;
var path = require ( 'path' ) ;
const fs = require ( 'fs' ) ;
const { Builder , By , Key , until } = require ( 'selenium-webdriver' ) ;
const fx = require ( 'selenium-webdriver/firefox' ) ;
const yargs = require ( 'yargs/yargs' )
const { hideBin } = require ( 'yargs/helpers' ) ;
// Entry page of the NZZ newspaper archive; every session starts and restarts here.
const URL = 'https://zeitungsarchiv.nzz.ch/';
// Milliseconds, passed as the last argument of the four-argument `driver.wait` calls.
const RETRY_DELAY = 2000;
// Upper bound (milliseconds) when waiting for a regular page element.
const WAIT_TIMEOUT = 2000;
const TIMEOUT_MSG = `Timeout after ${WAIT_TIMEOUT / 1000} seconds.`;
// Upper bound (milliseconds) when waiting for the search result list.
const SEARCH_WAIT_TIMEOUT = 5000;
const SEARCH_TIMEOUT_MSG = `Timeout after ${SEARCH_WAIT_TIMEOUT / 1000} seconds.`;
/**
 * Format the date as `YYYY-MM-DD` using the local calendar fields.
 *
 * @returns {String} Zero padded ISO style date without any whitespace.
 */
Date.prototype.isoDate = function () {
  const year = this.getFullYear();
  const month = String(this.getMonth() + 1).padStart(2, '0');
  const day = String(this.getDate()).padStart(2, '0');
  return `${year}-${month}-${day}`;
};

/**
 * Format the date as `DD.MM.YYYY` using the local calendar fields.
 *
 * @returns {String} Zero padded day and month followed by the full year.
 */
Date.prototype.nzzDate = function () {
  const day = String(this.getDate()).padStart(2, '0');
  const month = String(this.getMonth() + 1).padStart(2, '0');
  return `${day}.${month}.${this.getFullYear()}`;
};

/**
 * Return a copy of this date shifted by a number of days.
 *
 * @param {Number} days Days to add; negative values go back in time.
 * @returns {Date} New date object, the receiver is left untouched.
 */
Date.prototype.addDays = function (days) {
  const shifted = new Date(this.valueOf());
  shifted.setDate(shifted.getDate() + days);
  return shifted;
};
/**
 * Pause asynchronously.
 *
 * @param {Number} ms Delay in milliseconds.
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed.
 */
function sleep(ms) {
  const wait = (wake) => {
    setTimeout(wake, ms);
  };
  return new Promise(wait);
}
/**
 * Wait for an in progress download to finish and copy the file to the correct
 * destination.
 *
 * The downloaded file is copied, not renamed; the source stays in `tmpDir`.
 *
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 * @param {Date} date Date of the issue.
 * @returns {Promise<void>} Resolves after the copy; rejects if the copy fails.
 */
async function moveDownload(tmpDir, outDir, date) {
  // Dates in filenames are off by one before 1894
  const fileDate = date.getFullYear() < 1894 ? date.addDays(-1) : date;
  const srcFile = path.join(tmpDir, `Gesamtausgabe_NZZ_-_Neue_Zürcher_Zeitung_${fileDate.isoDate()}.pdf`);
  // The destination always carries the real issue date.
  const destFile = path.join(outDir, `${date.isoDate()}.pdf`);

  await new Promise((resolve) => {
    const onChange = () => {
      if (fs.existsSync(srcFile)) {
        fs.unwatchFile(srcFile, onChange);
        resolve();
      }
    };
    fs.watchFile(srcFile, onChange);
    // The watcher only reports changes, so a file that is already present
    // would never trigger it. Check once after the watcher is installed.
    onChange();
  });

  // NOTE(review): existence is treated as "download finished". If the browser
  // creates the final file before the transfer completes, this may copy a
  // partial file - confirm against the browser's download behavior.
  fs.copyFileSync(srcFile, destFile);
}
/**
 * Fill the start and end date of the search form with the same day and
 * submit the search by pressing enter in the end date field.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} date Date of the issue.
 */
async function enterDate(driver, date) {
  const formatted = date.nzzDate();
  const locateField = (selector) => driver.wait(
    until.elementLocated(By.css(selector)),
    WAIT_TIMEOUT,
    TIMEOUT_MSG,
    RETRY_DELAY,
  );

  const startField = await locateField('.fup-s-date-start');
  await startField.clear();
  await startField.sendKeys(formatted);

  const endField = await locateField('.fup-s-date-end');
  await endField.clear();
  // The trailing enter key submits the search.
  await endField.sendKeys(formatted + Key.ENTER);
}
/**
 * Login to the NZZ archive.
 *
 * Opens the archive start page, clicks the login menu entry and submits the
 * username and the password, each followed by the enter key.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {String} user Username for the login.
 * @param {String} password Password for the user.
 */
async function login(driver, user, password) {
  console.log('logging in...');
  await driver.get(URL);
  await sleep(500);

  // Poll for the login button like every other lookup in this file instead
  // of failing immediately when it has not been rendered yet.
  const loginButton = await driver.wait(
    until.elementLocated(By.css('.fup-menu-login-container')),
    WAIT_TIMEOUT,
    TIMEOUT_MSG,
    RETRY_DELAY,
  );
  await driver.wait(until.elementIsVisible(loginButton), WAIT_TIMEOUT);
  await sleep(1000);
  await loginButton.click();

  const emailField = await driver.wait(
    until.elementLocated(By.id('c1-login-field')),
    WAIT_TIMEOUT,
    TIMEOUT_MSG,
    RETRY_DELAY,
  );
  await emailField.sendKeys(user + Key.ENTER);

  const pwField = await driver.wait(
    until.elementLocated(By.id('c1-password-field')),
    WAIT_TIMEOUT,
    TIMEOUT_MSG,
    RETRY_DELAY,
  );
  await pwField.sendKeys(password + Key.ENTER);
}
/**
 * Start the download of a full issue.
 *
 * Clicks through the download menu of the currently opened article and
 * returns after the "back" menu entry is visible and a short pause.
 *
 * @param {WebDriver} driver Selenium driver to use.
 */
async function download(driver) {
  const locate = (selector) => driver.wait(
    until.elementLocated(By.css(selector)),
    WAIT_TIMEOUT,
    TIMEOUT_MSG,
    RETRY_DELAY,
  );

  const menuEntry = await locate('.fup-menu-item-download');
  await menuEntry.click();

  const editionEntry = await locate('.fup-s-menu-download-edition');
  await editionEntry.click();

  const backEntry = await locate('.fup-s-menu-back');
  await driver.wait(until.elementIsVisible(backEntry), WAIT_TIMEOUT);
  await sleep(500);
}
/**
 * Download all issues in a certain time span.
 *
 * Every day from `from` to `to` (both inclusive, compared by local calendar
 * day) is searched. Days without a search result are skipped. The returned
 * promise settles only after every started download has been handed over to
 * `moveDownload` and that call has finished, so callers may clean up the
 * download directory afterwards.
 *
 * @param {WebDriver} driver Selenium driver to use.
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {fs.PathLike} tmpDir Download directory.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function findIssues(driver, from, to, tmpDir, outDir) {
  const pendingMoves = [];

  // Start one day early: the loop advances first and then handles that day,
  // which keeps `to` itself inside the range.
  let previous = from.addDays(-1);
  while (previous.toDateString() !== to.toDateString()) {
    const issueDate = previous.addDays(1);
    previous = issueDate;
    const label = issueDate.isoDate();

    await enterDate(driver, issueDate);

    let articles;
    try {
      articles = await driver.wait(
        until.elementsLocated(By.css('.fup-archive-result-item-article-title')),
        SEARCH_WAIT_TIMEOUT,
        SEARCH_TIMEOUT_MSG,
        RETRY_DELAY,
      );
    } catch (err) {
      // a timeout means there is no issue on the searched date;
      // anything else is reported but does not stop the run
      if (!(err && err.name === 'TimeoutError')) {
        console.error(`search for ${label} failed:`, err);
      }
      continue;
    }

    try {
      await articles[0].click();
      await download(driver);
      console.log(`downloading ${label}...`);
      // Runs in the background while the next day is searched. Failures are
      // reported here so that a late rejection never goes unhandled.
      pendingMoves.push(
        moveDownload(tmpDir, outDir, issueDate).catch((err) => {
          console.error(`could not store ${label}:`, err);
        }),
      );
      await driver.get(URL);
    } catch (err) {
      // keep going with the next date, but do not hide the failure
      console.error(`could not download ${label}:`, err);
    }
  }

  // Do not return while files are still being copied out of tmpDir.
  await Promise.all(pendingMoves);
}
/**
 * Setup the headless browser and download all issues in the specified time span.
 *
 * Creates `outDir` if needed, downloads into a fresh temporary directory and
 * removes that directory again once the browser has been shut down, also
 * when the browser could not be started or the download failed.
 *
 * @param {Date} from Earliest issue to download.
 * @param {Date} to Latest issue to download.
 * @param {String} user Username for the nzz archive.
 * @param {String} password Password for the user.
 * @param {fs.PathLike} outDir Final destination directory.
 */
async function run(from, to, user, password, outDir) {
  // recursive: creates missing parent directories and accepts an existing one
  fs.mkdirSync(outDir, { recursive: true });

  const tmpDir = tmp.dirSync();
  console.log(`downloading to ${outDir} (tmp dir: ${tmpDir.name})...`);

  try {
    // Save PDFs straight into tmpDir without a viewer or a download dialog.
    const fxOptions = new fx.Options()
      .addArguments('-headless')
      .setPreference('pdfjs.disabled', true)
      .setPreference('browser.helperApps.neverAsk.openFile', 'application/pdf')
      .setPreference('browser.download.folderList', 2)
      // was misspelled as "showWhenStartingout", which is not a Firefox preference
      .setPreference('browser.download.manager.showWhenStarting', false)
      .setPreference('browser.download.dir', tmpDir.name)
      .setPreference('browser.helperApps.neverAsk.saveToDisk', 'application/pdf');

    const driver = await new Builder()
      .forBrowser('firefox')
      .setFirefoxOptions(fxOptions)
      .build();

    try {
      await login(driver, user, password);
      await findIssues(driver, from, to, tmpDir.name, outDir);
      await sleep(1000);
    } finally {
      // Wait for the browser to shut down before its download directory goes away.
      await driver.quit();
    }
  } finally {
    // Recursive rmdirSync is deprecated; prefer rmSync where the runtime has it.
    const removeDir = typeof fs.rmSync === 'function' ? fs.rmSync : fs.rmdirSync;
    removeDir(tmpDir.name, { recursive: true });
  }
}
/**
 * Parse arguments and start the downloading of the issues.
 *
 * Exits with status 1 when a date cannot be parsed or when "from" lies after
 * "to"; a failed download run is reported and sets the exit code to 1.
 */
(async function init() {
  /**
   * Turn a command line value into a date.
   *
   * `YYYY-MM-DD` is read as a local calendar day, matching the local getters
   * used for formatting. Everything else is left to the Date constructor.
   *
   * @param {*} value Raw option value.
   * @returns {Date|null} Parsed date or null if the value is not a date.
   */
  const parseDate = (value) => {
    const parts = /^([1-9]\d{3})-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$/.exec(String(value));
    const date = parts
      ? new Date(Number(parts[1]), Number(parts[2]) - 1, Number(parts[3]))
      : new Date(value);
    return Number.isNaN(date.getTime()) ? null : date;
  };

  const nowString = new Date().isoDate();

  const argv = yargs(hideBin(process.argv))
    .usage('Usage: $0 -f [date] -t [date] -o [path] -u [username] -p [password]')
    .demandOption(['u', 'p'])
    .help('h')
    .describe('f', 'Earliest issue to download.')
    .describe('t', 'Latest issue to download.')
    .describe('o', 'Download directory.')
    .describe('u', 'Username for the nzz archive.')
    .describe('p', 'Password for the user.')
    .alias('h', 'help')
    .alias('f', 'from')
    .alias('t', 'to')
    .alias('o', 'out')
    .alias('u', 'user')
    .alias('p', 'password')
    .default('f', nowString)
    .default('t', nowString)
    .default('o', './nzz')
    .epilog('Copyright (c) Sebastian Hugentobler <sebastian@vanwa.ch> 2020')
    .example("$0 -u 'myuser@example.com' -p 'mypassword' -f 1780-01-01 -t 1780-02-28", 'Download all existing issues from 01-01-1780 until 28-02-1780 to the default directory "./nzz"')
    .argv;

  const from = parseDate(argv.from);
  const to = parseDate(argv.to);

  // An invalid date would never compare equal to the end date while iterating.
  if (from === null || to === null) {
    console.error('"from" and "to" must be valid dates (YYYY-MM-DD)');
    process.exit(1);
  }

  if (from > to) {
    console.error('"from" date must be before "to" date');
    process.exit(1);
  }

  try {
    await run(from, to, argv.user, argv.password, argv.out);
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  }
})();