rewrite so webdriver is only used for the login

Sebastian Hugentobler 2024-07-03 12:32:47 +02:00
parent 3edb6a7671
commit cf8fbe0965
Signed by: shu
GPG key ID: BB32CF3CA052C2F0
26 changed files with 5385 additions and 1296 deletions

download/Cargo.lock (generated, new file, 1859 lines)

File diff suppressed because it is too large.

download/Cargo.toml (new file, 20 lines)

@@ -0,0 +1,20 @@
[package]
name = "nzz-download"
version = "0.1.0"
edition = "2021"
license = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }
[dependencies]
anyhow = { workspace = true }
clap = { workspace = true }
lopdf = "0.32.0"
reqwest = { version = "0.12.5", features = ["json"] }
serde = { version = "1.0.203", features = ["derive"] }
serde_json = { workspace = true }
tempfile = "3.10.1"
tokio = { workspace = true }
time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"

download/src/cli.rs (new file, 31 lines)

@@ -0,0 +1,31 @@
//! CLI interface.
use std::path::PathBuf;
use clap::Parser;
use time::error::Parse;
use time::Date;
use crate::date::FORMAT;
/// Parse a date provided as a CLI argument.
fn parse_date(input: &str) -> Result<Date, Parse> {
Date::parse(input, FORMAT)
}
/// Download issues of the NZZ newspaper
#[derive(Parser)]
#[command(version, about, long_about = None, after_help = "Provide the authentication cookie from stdin.")]
pub struct Config {
/// Earliest issue to download (like 1780-12-31)
#[arg(short, long, env, value_parser=parse_date)]
pub from: Date,
/// Latest issue to download (like 1780-12-31)
#[arg(short, long, env, value_parser=parse_date)]
pub to: Date,
/// Output directory.
#[arg(short, long, env, default_value = "./nzz")]
pub output_dir: PathBuf,
}
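
A minimal sketch (not part of this diff) of exercising the parser above; the binary name and dates are illustrative, and `parse_date` runs through `value_parser` for `--from`/`--to`. On the command line this corresponds to something like `nzz-download --from 1780-01-01 --to 1780-12-31 < cookie.txt`.

#[test]
fn parses_date_range() {
    use clap::Parser;

    // Equivalent to: nzz-download --from 1780-01-01 --to 1780-12-31
    let cfg = Config::parse_from(["nzz-download", "--from", "1780-01-01", "--to", "1780-12-31"]);
    assert_eq!(cfg.from.year(), 1780);
    assert_eq!(cfg.output_dir, std::path::PathBuf::from("./nzz"));
}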

download/src/date.rs (new file, 27 lines)

@@ -0,0 +1,27 @@
//! Utilities for handling dates.
use serde::{Deserialize, Deserializer, Serializer};
use time::format_description::FormatItem;
use time::macros::format_description;
use time::Date;
/// Date format for newspaper issues (YYYY-MM-DD).
pub const FORMAT: &[FormatItem<'_>] = format_description!("[year]-[month]-[day]");
/// Serialize a date to a String with serde.
pub fn serialize<S>(value: &Date, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let formatted = value.format(&FORMAT).unwrap();
serializer.serialize_str(&formatted)
}
/// Deserialize a String to a Date with serde.
pub fn deserialize<'de, D>(deserializer: D) -> Result<Date, D::Error>
where
D: Deserializer<'de>,
{
let s: &str = Deserialize::deserialize(deserializer)?;
Date::parse(s, FORMAT).map_err(serde::de::Error::custom)
}
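
A small round-trip sketch (illustrative, not in the commit) showing how these helpers hook into serde, the same way the structs in nzz.rs below use them:

#[test]
fn date_round_trip() {
    #[derive(serde::Serialize, serde::Deserialize)]
    struct Dated {
        #[serde(
            serialize_with = "crate::date::serialize",
            deserialize_with = "crate::date::deserialize"
        )]
        day: time::Date,
    }

    // "1780-12-31" survives a Date round trip through serde_json.
    let parsed: Dated = serde_json::from_str(r#"{"day":"1780-12-31"}"#).unwrap();
    assert_eq!(serde_json::to_string(&parsed).unwrap(), r#"{"day":"1780-12-31"}"#);
}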

download/src/download.rs (new file, 52 lines)

@@ -0,0 +1,52 @@
//! Handle downloads of newspaper issues.
use std::{
fs::{self},
io::{Cursor, Read},
path::Path,
};
use anyhow::Result;
use tracing::{debug, info};
use crate::{nzz::Issue, pdf};
/// Download all pages of the provided `issues` and save them merged to the directory `output_dir`.
///
/// Create `output_dir` if it does not exist.
pub async fn fetch(issues: Vec<Issue>, output_dir: &Path) -> Result<()> {
debug!("ensuring {output_dir:?} exists");
fs::create_dir_all(output_dir)?;
for issue in issues {
info!("saving issue {}", issue.publication_date);
let tmp_dir = tempfile::tempdir()?;
let mut pages = Vec::new();
for (i, page) in issue.pages.into_iter().enumerate() {
debug!(
"fetching issue {}, page {}: {page}",
issue.publication_date,
i + 1
);
let response = reqwest::Client::new().get(page).send().await?;
let mut content = Cursor::new(response.bytes().await?);
let mut page_data = Vec::new();
content.read_to_end(&mut page_data)?;
let tmp_page = tmp_dir.path().join(i.to_string());
fs::write(&tmp_page, page_data)?;
pages.push(tmp_page);
}
let issue_name = format!("nzz_{}.pdf", issue.publication_date);
let issue_path = output_dir.join(issue_name);
let issue_title = format!("NZZ {}", issue.publication_date);
pdf::merge(pages, &issue_path, &issue_title)?;
debug!("issue {} saved", issue.publication_date);
}
Ok(())
}
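
A hedged usage sketch of `fetch` (not in this diff); the page URL is a placeholder and the call only succeeds if it resolves to a real PDF:

#[tokio::test]
async fn saves_single_page_issue() -> anyhow::Result<()> {
    // Build a one-page issue by hand instead of going through nzz::fetch.
    let issue = Issue {
        publication_date: time::macros::date!(1780 - 12 - 31),
        pages: vec!["https://example.com/page1.pdf".to_string()],
    };
    fetch(vec![issue], std::path::Path::new("./nzz")).await
}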

download/src/lib.rs (new file, 19 lines)

@@ -0,0 +1,19 @@
//! A small utility to download issues of the NZZ newspaper.
use anyhow::Result;
use cli::Config;
pub mod cli;
pub mod date;
pub mod download;
pub mod nzz;
pub mod pdf;
/// Entry point to download NZZ issues.
pub async fn run(args: Config, cookie: &str) -> Result<()> {
let issues = nzz::fetch(cookie, args.from, args.to).await?;
download::fetch(issues, &args.output_dir).await?;
Ok(())
}

download/src/main.rs (new file, 29 lines)

@@ -0,0 +1,29 @@
use std::io::{self, Read};
use anyhow::Result;
use clap::Parser;
use nzz_download::cli::Config;
#[tokio::main]
async fn main() -> Result<()> {
if std::env::var_os("RUST_LOG").is_none() {
std::env::set_var("RUST_LOG", "info");
}
tracing_subscriber::fmt::init();
let args = Config::parse();
let cookie = read_cookie().expect("Provide the authentication cookie via stdin");
nzz_download::run(args, &cookie).await
}
/// Read the authentication cookie from stdin.
fn read_cookie() -> Result<String> {
let stdin = io::stdin();
let mut buffer = String::new();
stdin.lock().read_to_string(&mut buffer)?;
let cookie = buffer.trim();
Ok(cookie.to_string())
}

download/src/nzz.rs (new file, 198 lines)

@@ -0,0 +1,198 @@
//! Handle information relating to NZZ issues.
use anyhow::Result;
use serde::{Deserialize, Serialize};
use time::Date;
use tracing::info;
const SEARCH_URL: &str = "https://zeitungsarchiv.nzz.ch/solr-epaper-search/1.0/search";
const ISSUE_URL: &str = "https://zeitungsarchiv.nzz.ch/archive/1.0/getPages";
#[derive(Debug, Serialize, Deserialize)]
struct SearchData {
query: String,
offset: u32,
#[serde(rename = "sortField")]
sort_field: String,
#[serde(rename = "sortOrder")]
sort_order: String,
#[serde(
rename = "startDate",
serialize_with = "crate::date::serialize",
deserialize_with = "crate::date::deserialize"
)]
start_date: Date,
#[serde(
rename = "endDate",
serialize_with = "crate::date::serialize",
deserialize_with = "crate::date::deserialize"
)]
end_date: Date,
}
#[derive(Debug, Serialize, Deserialize)]
struct SearchResult {
data: SearchInfo,
}
#[derive(Debug, Serialize, Deserialize)]
struct SearchInfo {
total: u32,
offset: u32,
#[serde(rename = "pageSize")]
page_size: u32,
#[serde(rename = "resData")]
res_data: Vec<IssueData>,
}
#[derive(Debug, Serialize, Deserialize)]
struct IssueData {
#[serde(rename = "editionId")]
edition_id: u32,
#[serde(rename = "pageNumber")]
page_nr: u32,
#[serde(
rename = "publicationDate",
deserialize_with = "crate::date::deserialize"
)]
publication_date: Date,
}
#[derive(Debug, Serialize, Deserialize)]
struct PagesResult {
data: PagesInfo,
}
#[derive(Debug, Serialize, Deserialize)]
struct PagesInfo {
pages: Vec<Page>,
}
#[derive(Debug, Serialize, Deserialize)]
struct Page {
#[serde(rename = "pmPageNumber")]
page_nr: u32,
#[serde(rename = "pageDocUrl")]
doc: PageDoc,
}
#[derive(Debug, Serialize, Deserialize)]
struct PageDoc {
#[serde(rename = "HIGHRES")]
link: PageHighRes,
}
#[derive(Debug, Serialize, Deserialize)]
struct PageHighRes {
url: String,
}
/// A single NZZ issue.
#[derive(Debug, Clone)]
pub struct Issue {
/// Date of publication.
pub publication_date: Date,
/// Ordered vector of page URLs in the issue.
pub pages: Vec<String>,
}
impl SearchData {
pub fn new(offset: u32, start_date: Date, end_date: Date) -> Self {
Self {
query: "".to_string(),
offset,
sort_field: "media_ts".to_string(),
sort_order: "desc".to_string(),
start_date,
end_date,
}
}
}
/// Search all issues between `from` and `to` (inclusive) using an `offset` into the results.
async fn offset_search(offset: u32, cookie: &str, from: Date, to: Date) -> Result<SearchInfo> {
let data = SearchData::new(offset, from, to);
let result: SearchResult = reqwest::Client::new()
.post(SEARCH_URL)
.header("Cookie", cookie)
.json(&data)
.send()
.await?
.json()
.await?;
Ok(result.data)
}
/// Only keep first pages; they are enough to get the edition id.
fn filter_issues(unfiltered_issues: Vec<IssueData>) -> Vec<IssueData> {
unfiltered_issues
.into_iter()
.filter(|info| info.page_nr == 1)
.collect()
}
/// Search all issues between `from` and `to` (inclusive) respecting pagination.
async fn search(cookie: &str, from: Date, to: Date) -> Result<Vec<IssueData>> {
info!("looking for issues between {from} to {to}");
let mut result = offset_search(0, cookie, from, to).await?;
let mut issues: Vec<IssueData> = filter_issues(result.res_data);
while result.offset + result.page_size < result.total {
result = offset_search(result.offset + result.page_size, cookie, from, to).await?;
issues.extend(filter_issues(result.res_data));
}
Ok(issues)
}
/// Fetch all page URLs for the issue with edition id `edition_id` and order them by page number.
async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
let result: PagesResult = reqwest::Client::new()
.post(ISSUE_URL)
.header("Cookie", cookie)
.json(&serde_json::json!({
"editionId": edition_id,
}))
.send()
.await?
.json()
.await?;
let mut pages: Vec<(u32, String)> = result
.data
.pages
.into_iter()
.map(|page| (page.page_nr, page.doc.link.url))
.collect();
pages.sort_by(|a, b| a.0.cmp(&b.0));
let pages = pages.into_iter().map(|page| page.1).collect();
Ok(pages)
}
/// Fetch all page URLs for `issues`.
async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
let mut hydrated_issues = Vec::new();
for issue in issues {
info!(
"fetching page information for issue {}",
issue.publication_date
);
let pages = build_pages(cookie, issue.edition_id).await?;
hydrated_issues.push(Issue {
publication_date: issue.publication_date,
pages,
});
}
Ok(hydrated_issues)
}
/// Fetch issue information in the date range `from` to `to` (inclusive) using `cookie` for
/// authentication.
pub async fn fetch(cookie: &str, from: Date, to: Date) -> Result<Vec<Issue>> {
let issues = search(cookie, from, to).await?;
let issues = build_issues(cookie, issues).await?;
Ok(issues)
}
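
For orientation, a sketch (field values invented, the archive endpoints are not documented here) of the response shape the structs above deserialize; the pagination loop in `search` keys off `total`, `offset`, and `pageSize`:

#[test]
fn parses_search_response_shape() {
    let body = r#"{"data":{"total":1,"offset":0,"pageSize":20,
        "resData":[{"editionId":42,"pageNumber":1,"publicationDate":"1780-12-31"}]}}"#;
    let parsed: SearchResult = serde_json::from_str(body).unwrap();
    assert_eq!(parsed.data.res_data[0].edition_id, 42);
    assert_eq!(parsed.data.res_data[0].page_nr, 1);
}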

download/src/pdf.rs (new file, 177 lines)

@@ -0,0 +1,177 @@
//! Manipulate pdf documents.
use std::{
collections::BTreeMap,
path::{Path, PathBuf},
};
use anyhow::Result;
use lopdf::{Dictionary, Document, Object, ObjectId};
const METADATA_TITLE: &str = "Title";
const METADATA_PRODUCER: &str = "Producer";
const PDF_VERSION: &str = "1.8";
const PRODUCER: &str = "NZZ Downloader";
/// Merge the provided pdfs in the `input` vector to one pdf in `out`, setting its title to
/// `title`.
///
/// The code is from https://github.com/J-F-Liu/lopdf/blob/6b04581640e061bfeb39b585e50a7e9d102b8fe2/examples/merge.rs
/// with some modifications. I have no clue about PDF structure and this is still a bit of a
/// mystery to me.
pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
let mut max_id = 1;
let mut documents_pages = BTreeMap::new();
let mut documents_objects = BTreeMap::new();
let mut merged_doc = Document::with_version(PDF_VERSION);
for pdf in input {
let mut doc = Document::load(pdf)?;
doc.renumber_objects_with(max_id);
max_id = doc.max_id + 1;
documents_pages.extend(
doc.get_pages()
.into_values()
.map(|object_id| (object_id, doc.get_object(object_id).unwrap().to_owned()))
.collect::<BTreeMap<ObjectId, Object>>(),
);
documents_objects.extend(doc.objects);
}
let mut catalog_object: Option<(ObjectId, Object)> = None;
let mut pages_object: Option<(ObjectId, Object)> = None;
// Process all objects except "Page" type
for (object_id, object) in documents_objects.iter() {
// We have to ignore "Page" (as they are processed later), "Outlines" and "Outline" objects
// All other objects should be collected and inserted into the main Document
match object.type_name().unwrap_or("") {
"Catalog" => {
// Collect a first "Catalog" object and use it for the future "Pages"
catalog_object = Some((
if let Some((id, _)) = catalog_object {
id
} else {
*object_id
},
object.clone(),
));
}
"Pages" => {
// Collect and update a first "Pages" object and use it for the future "Catalog"
// We also have to merge all dictionaries of the old and the new "Pages" objects
if let Ok(dictionary) = object.as_dict() {
let mut dictionary = dictionary.clone();
if let Some((_, ref object)) = pages_object {
if let Ok(old_dictionary) = object.as_dict() {
dictionary.extend(old_dictionary);
}
}
pages_object = Some((
if let Some((id, _)) = pages_object {
id
} else {
*object_id
},
Object::Dictionary(dictionary),
));
}
}
"Page" => {} // Ignored, processed later and separately
"Outlines" => {} // Ignored, not supported yet
"Outline" => {} // Ignored, not supported yet
_ => {
merged_doc.max_id += 1;
merged_doc.objects.insert(*object_id, object.clone());
}
}
}
for (object_id, object) in documents_pages.iter() {
if let Ok(dictionary) = object.as_dict() {
let mut dictionary = dictionary.clone();
dictionary.set("Parent", pages_object.as_ref().unwrap().0);
merged_doc
.objects
.insert(*object_id, Object::Dictionary(dictionary));
}
}
let catalog_object = catalog_object.unwrap();
let pages_object = pages_object.unwrap();
// Build a new "Pages" with updated fields
if let Ok(dictionary) = pages_object.1.as_dict() {
let mut dictionary = dictionary.clone();
// Set new pages count
dictionary.set("Count", documents_pages.len() as u32);
// Set new "Kids" list (collected from documents pages) for "Pages"
dictionary.set(
"Kids",
documents_pages
.into_keys()
.map(Object::Reference)
.collect::<Vec<_>>(),
);
merged_doc
.objects
.insert(pages_object.0, Object::Dictionary(dictionary));
}
// Build a new "Catalog" with updated fields
if let Ok(dictionary) = catalog_object.1.as_dict() {
let mut dictionary = dictionary.clone();
dictionary.set("Pages", pages_object.0);
dictionary.remove(b"Outlines"); // Outlines not supported in merged PDFs
merged_doc
.objects
.insert(catalog_object.0, Object::Dictionary(dictionary));
}
merged_doc.trailer.set("Root", catalog_object.0);
set_metadata(METADATA_TITLE, title, &mut merged_doc);
set_metadata(METADATA_PRODUCER, PRODUCER, &mut merged_doc);
// Update the max internal ID as it wasn't updated before due to direct object insertion
merged_doc.max_id = merged_doc.objects.len() as u32;
// Reorder all new Document objects
merged_doc.renumber_objects();
merged_doc.compress();
merged_doc.save(out)?;
Ok(())
}
/// Set metadata `key` to `value`.
///
/// Add the `Info` trailer to the pdf document if it does not yet exist.
fn set_metadata(key: &str, value: &str, doc: &mut Document) {
let info_dict_id = match doc.trailer.get(b"Info") {
Ok(&Object::Reference(id)) => id,
_ => {
// without this the following add_object call overwrites an existing
// object at max_id
doc.max_id += 1;
let id = doc.add_object(Dictionary::new());
doc.trailer.set("Info", Object::Reference(id));
id
}
};
if let Some(Object::Dictionary(ref mut info_dict)) = doc.objects.get_mut(&info_dict_id) {
info_dict.set(key, Object::string_literal(value));
}
}
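
A small sketch (not part of the commit) of what `set_metadata` guarantees: after the call, the trailer points at an `Info` dictionary that carries the key.

#[test]
fn sets_title_metadata() {
    let mut doc = Document::with_version(PDF_VERSION);
    set_metadata(METADATA_TITLE, "NZZ 1780-12-31", &mut doc);

    // The trailer now references an Info dictionary containing the Title entry.
    let info_id = match doc.trailer.get(b"Info").unwrap() {
        Object::Reference(id) => *id,
        _ => panic!("expected an Info reference"),
    };
    let info = doc.get_object(info_id).unwrap().as_dict().unwrap();
    assert!(info.get(METADATA_TITLE.as_bytes()).is_ok());
}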