rewrite so webdriver is only used for the login

Sebastian Hugentobler 2024-07-03 12:32:47 +02:00
parent 3edb6a7671
commit cf8fbe0965
Signed by: shu
GPG key ID: BB32CF3CA052C2F0
26 changed files with 5385 additions and 1296 deletions

download/Cargo.lock (generated, new file, 1859 lines)

File diff suppressed because it is too large.

download/Cargo.toml (new file, 20 lines)

@@ -0,0 +1,20 @@
[package]
name = "nzz-download"
version = "0.1.0"
edition = "2021"
license = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }
[dependencies]
anyhow = { workspace = true }
clap = { workspace = true }
lopdf = "0.32.0"
reqwest = { version = "0.12.5", features = ["json"] }
serde = { version = "1.0.203", features = ["derive"] }
serde_json = { workspace = true }
tempfile = "3.10.1"
tokio = { workspace = true }
time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"

download/src/cli.rs (new file, 31 lines)

@@ -0,0 +1,31 @@
//! CLI interface.
use std::path::PathBuf;
use clap::Parser;
use time::error::Parse;
use time::Date;
use crate::date::FORMAT;
/// Parse a date provided as a CLI argument.
fn parse_date(input: &str) -> Result<Date, Parse> {
Date::parse(input, FORMAT)
}
/// Download issues of the NZZ newspaper
#[derive(Parser)]
#[command(version, about, long_about = None, after_help = "Provide the authentication cookie from stdin.")]
pub struct Config {
/// Earliest issue to download (like 1780-12-31)
#[arg(short, long, env, value_parser=parse_date)]
pub from: Date,
/// Latest issue to download (like 1780-12-31)
#[arg(short, long, env, value_parser=parse_date)]
pub to: Date,
/// Output directory.
#[arg(short, long, env, default_value = "./nzz")]
pub output_dir: PathBuf,
}
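
A minimal sketch (not part of this diff) of exercising the parser above; the binary name and dates are illustrative, and `parse_date` runs through `value_parser` for `--from`/`--to`. On the command line this corresponds to something like `nzz-download --from 1780-01-01 --to 1780-12-31 < cookie.txt`.

#[test]
fn parses_date_range() {
    use clap::Parser;

    // Equivalent to: nzz-download --from 1780-01-01 --to 1780-12-31
    let cfg = Config::parse_from(["nzz-download", "--from", "1780-01-01", "--to", "1780-12-31"]);
    assert_eq!(cfg.from.year(), 1780);
    assert_eq!(cfg.output_dir, std::path::PathBuf::from("./nzz"));
}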

download/src/date.rs (new file, 27 lines)

@@ -0,0 +1,27 @@
//! Utilities for handling dates.
use serde::{Deserialize, Deserializer, Serializer};
use time::format_description::FormatItem;
use time::macros::format_description;
use time::Date;
/// Date format for newspaper issues (YYYY-MM-DD).
pub const FORMAT: &[FormatItem<'_>] = format_description!("[year]-[month]-[day]");
/// Serialize a date to a String with serde.
pub fn serialize<S>(value: &Date, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let formatted = value.format(&FORMAT).unwrap();
serializer.serialize_str(&formatted)
}
/// Deserialize a String to a Date with serde.
pub fn deserialize<'de, D>(deserializer: D) -> Result<Date, D::Error>
where
D: Deserializer<'de>,
{
let s: &str = Deserialize::deserialize(deserializer)?;
Date::parse(s, FORMAT).map_err(serde::de::Error::custom)
}
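
A small round-trip sketch (illustrative, not in the commit) showing how these helpers hook into serde, the same way the structs in nzz.rs below use them:

#[test]
fn date_round_trip() {
    #[derive(serde::Serialize, serde::Deserialize)]
    struct Dated {
        #[serde(
            serialize_with = "crate::date::serialize",
            deserialize_with = "crate::date::deserialize"
        )]
        day: time::Date,
    }

    // "1780-12-31" survives a Date round trip through serde_json.
    let parsed: Dated = serde_json::from_str(r#"{"day":"1780-12-31"}"#).unwrap();
    assert_eq!(serde_json::to_string(&parsed).unwrap(), r#"{"day":"1780-12-31"}"#);
}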

download/src/download.rs (new file, 52 lines)

@@ -0,0 +1,52 @@
//! Handle downloads of newspaper issues.
use std::{
fs::{self},
io::{Cursor, Read},
path::Path,
};
use anyhow::Result;
use tracing::{debug, info};
use crate::{nzz::Issue, pdf};
/// Download all pages of the provided `issues` and save them merged to the directory `output_dir`.
///
/// Create `output_dir` if it does not exist.
pub async fn fetch(issues: Vec<Issue>, output_dir: &Path) -> Result<()> {
debug!("ensuring {output_dir:?} exists");
fs::create_dir_all(output_dir)?;
for issue in issues {
info!("saving issue {}", issue.publication_date);
let tmp_dir = tempfile::tempdir()?;
let mut pages = Vec::new();
for (i, page) in issue.pages.into_iter().enumerate() {
debug!(
"fetching issue {}, page {}: {page}",
issue.publication_date,
i + 1
);
let response = reqwest::Client::new().get(page).send().await?;
let mut content = Cursor::new(response.bytes().await?);
let mut page_data = Vec::new();
content.read_to_end(&mut page_data)?;
let tmp_page = tmp_dir.path().join(i.to_string());
fs::write(&tmp_page, page_data)?;
pages.push(tmp_page);
}
let issue_name = format!("nzz_{}.pdf", issue.publication_date);
let issue_path = output_dir.join(issue_name);
let issue_title = format!("NZZ {}", issue.publication_date);
pdf::merge(pages, &issue_path, &issue_title)?;
debug!("issue {} saved", issue.publication_date);
}
Ok(())
}
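
A hedged usage sketch of `fetch` (not in this diff); the page URL is a placeholder and the call only succeeds if it resolves to a real PDF:

#[tokio::test]
async fn saves_single_page_issue() -> anyhow::Result<()> {
    // Build a one-page issue by hand instead of going through nzz::fetch.
    let issue = Issue {
        publication_date: time::macros::date!(1780 - 12 - 31),
        pages: vec!["https://example.com/page1.pdf".to_string()],
    };
    fetch(vec![issue], std::path::Path::new("./nzz")).await
}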

download/src/lib.rs (new file, 19 lines)

@@ -0,0 +1,19 @@
//! A small utility to download issues of the NZZ newspaper.
use anyhow::Result;
use cli::Config;
pub mod cli;
pub mod date;
pub mod download;
pub mod nzz;
pub mod pdf;
/// Entry point to download NZZ issues.
pub async fn run(args: Config, cookie: &str) -> Result<()> {
let issues = nzz::fetch(cookie, args.from, args.to).await?;
download::fetch(issues, &args.output_dir).await?;
Ok(())
}

download/src/main.rs (new file, 29 lines)

@@ -0,0 +1,29 @@
use std::io::{self, Read};
use anyhow::Result;
use clap::Parser;
use nzz_download::cli::Config;
#[tokio::main]
async fn main() -> Result<()> {
if std::env::var_os("RUST_LOG").is_none() {
std::env::set_var("RUST_LOG", "info");
}
tracing_subscriber::fmt::init();
let args = Config::parse();
let cookie = read_cookie().expect("Provide the authentication cookie via stdin");
nzz_download::run(args, &cookie).await
}
/// Read the authentication cookie from stdin.
fn read_cookie() -> Result<String> {
let stdin = io::stdin();
let mut buffer = String::new();
stdin.lock().read_to_string(&mut buffer)?;
let cookie = buffer.trim();
Ok(cookie.to_string())
}

download/src/nzz.rs (new file, 198 lines)

@@ -0,0 +1,198 @@
//! Handle information relating to NZZ issues.
use anyhow::Result;
use serde::{Deserialize, Serialize};
use time::Date;
use tracing::info;
const SEARCH_URL: &str = "https://zeitungsarchiv.nzz.ch/solr-epaper-search/1.0/search";
const ISSUE_URL: &str = "https://zeitungsarchiv.nzz.ch/archive/1.0/getPages";
#[derive(Debug, Serialize, Deserialize)]
struct SearchData {
query: String,
offset: u32,
#[serde(rename = "sortField")]
sort_field: String,
#[serde(rename = "sortOrder")]
sort_order: String,
#[serde(
rename = "startDate",
serialize_with = "crate::date::serialize",
deserialize_with = "crate::date::deserialize"
)]
start_date: Date,
#[serde(
rename = "endDate",
serialize_with = "crate::date::serialize",
deserialize_with = "crate::date::deserialize"
)]
end_date: Date,
}
#[derive(Debug, Serialize, Deserialize)]
struct SearchResult {
data: SearchInfo,
}
#[derive(Debug, Serialize, Deserialize)]
struct SearchInfo {
total: u32,
offset: u32,
#[serde(rename = "pageSize")]
page_size: u32,
#[serde(rename = "resData")]
res_data: Vec<IssueData>,
}
#[derive(Debug, Serialize, Deserialize)]
struct IssueData {
#[serde(rename = "editionId")]
edition_id: u32,
#[serde(rename = "pageNumber")]
page_nr: u32,
#[serde(
rename = "publicationDate",
deserialize_with = "crate::date::deserialize"
)]
publication_date: Date,
}
#[derive(Debug, Serialize, Deserialize)]
struct PagesResult {
data: PagesInfo,
}
#[derive(Debug, Serialize, Deserialize)]
struct PagesInfo {
pages: Vec<Page>,
}
#[derive(Debug, Serialize, Deserialize)]
struct Page {
#[serde(rename = "pmPageNumber")]
page_nr: u32,
#[serde(rename = "pageDocUrl")]
doc: PageDoc,
}
#[derive(Debug, Serialize, Deserialize)]
struct PageDoc {
#[serde(rename = "HIGHRES")]
link: PageHighRes,
}
#[derive(Debug, Serialize, Deserialize)]
struct PageHighRes {
url: String,
}
/// A single NZZ issue.
#[derive(Debug, Clone)]
pub struct Issue {
/// Date of publication.
pub publication_date: Date,
/// Ordered vector of page URLs in the issue.
pub pages: Vec<String>,
}
impl SearchData {
pub fn new(offset: u32, start_date: Date, end_date: Date) -> Self {
Self {
query: "".to_string(),
offset,
sort_field: "media_ts".to_string(),
sort_order: "desc".to_string(),
start_date,
end_date,
}
}
}
/// Search all issues between `from` and `to` (inclusive) using an `offset` into the results.
async fn offset_search(offset: u32, cookie: &str, from: Date, to: Date) -> Result<SearchInfo> {
let data = SearchData::new(offset, from, to);
let result: SearchResult = reqwest::Client::new()
.post(SEARCH_URL)
.header("Cookie", cookie)
.json(&data)
.send()
.await?
.json()
.await?;
Ok(result.data)
}
/// Only keep first pages; they are enough to get the edition id.
fn filter_issues(unfiltered_issues: Vec<IssueData>) -> Vec<IssueData> {
unfiltered_issues
.into_iter()
.filter(|info| info.page_nr == 1)
.collect()
}
/// Search all issues between `from` and `to` (inclusive) respecting pagination.
async fn search(cookie: &str, from: Date, to: Date) -> Result<Vec<IssueData>> {
info!("looking for issues between {from} to {to}");
let mut result = offset_search(0, cookie, from, to).await?;
let mut issues: Vec<IssueData> = filter_issues(result.res_data);
while result.offset + result.page_size < result.total {
result = offset_search(result.offset + result.page_size, cookie, from, to).await?;
issues.extend(filter_issues(result.res_data));
}
Ok(issues)
}
/// Fetch all page URLs for the issue with edition id `edition_id` and order them by page number.
async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
let result: PagesResult = reqwest::Client::new()
.post(ISSUE_URL)
.header("Cookie", cookie)
.json(&serde_json::json!({
"editionId": edition_id,
}))
.send()
.await?
.json()
.await?;
let mut pages: Vec<(u32, String)> = result
.data
.pages
.into_iter()
.map(|page| (page.page_nr, page.doc.link.url))
.collect();
pages.sort_by(|a, b| a.0.cmp(&b.0));
let pages = pages.into_iter().map(|page| page.1).collect();
Ok(pages)
}
/// Fetch all page URLs for `issues`.
async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
let mut hydrated_issues = Vec::new();
for issue in issues {
info!(
"fetching page information for issue {}",
issue.publication_date
);
let pages = build_pages(cookie, issue.edition_id).await?;
hydrated_issues.push(Issue {
publication_date: issue.publication_date,
pages,
});
}
Ok(hydrated_issues)
}
/// Fetch issue information in the date range `from` to `to` (inclusive) using `cookie` for
/// authentication.
pub async fn fetch(cookie: &str, from: Date, to: Date) -> Result<Vec<Issue>> {
let issues = search(cookie, from, to).await?;
let issues = build_issues(cookie, issues).await?;
Ok(issues)
}
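
For orientation, a sketch (field values invented, the archive endpoints are not documented here) of the response shape the structs above deserialize; the pagination loop in `search` keys off `total`, `offset`, and `pageSize`:

#[test]
fn parses_search_response_shape() {
    let body = r#"{"data":{"total":1,"offset":0,"pageSize":20,
        "resData":[{"editionId":42,"pageNumber":1,"publicationDate":"1780-12-31"}]}}"#;
    let parsed: SearchResult = serde_json::from_str(body).unwrap();
    assert_eq!(parsed.data.res_data[0].edition_id, 42);
    assert_eq!(parsed.data.res_data[0].page_nr, 1);
}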

download/src/pdf.rs (new file, 177 lines)

@@ -0,0 +1,177 @@
//! Manipulate pdf documents.
use std::{
collections::BTreeMap,
path::{Path, PathBuf},
};
use anyhow::Result;
use lopdf::{Dictionary, Document, Object, ObjectId};
const METADATA_TITLE: &str = "Title";
const METADATA_PRODUCER: &str = "Producer";
const PDF_VERSION: &str = "1.8";
const PRODUCER: &str = "NZZ Downloader";
/// Merge the provided pdfs in the `input` vector to one pdf in `out`, setting its title to
/// `title`.
///
/// The code is from https://github.com/J-F-Liu/lopdf/blob/6b04581640e061bfeb39b585e50a7e9d102b8fe2/examples/merge.rs
/// with some modifications. I have no clue about PDF structure and this is still a bit of a
/// mystery to me.
pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
let mut max_id = 1;
let mut documents_pages = BTreeMap::new();
let mut documents_objects = BTreeMap::new();
let mut merged_doc = Document::with_version(PDF_VERSION);
for pdf in input {
let mut doc = Document::load(pdf)?;
doc.renumber_objects_with(max_id);
max_id = doc.max_id + 1;
documents_pages.extend(
doc.get_pages()
.into_values()
.map(|object_id| (object_id, doc.get_object(object_id).unwrap().to_owned()))
.collect::<BTreeMap<ObjectId, Object>>(),
);
documents_objects.extend(doc.objects);
}
let mut catalog_object: Option<(ObjectId, Object)> = None;
let mut pages_object: Option<(ObjectId, Object)> = None;
// Process all objects except "Page" type
for (object_id, object) in documents_objects.iter() {
// We have to ignore "Page" (as they are processed later), "Outlines" and "Outline" objects
// All other objects should be collected and inserted into the main Document
match object.type_name().unwrap_or("") {
"Catalog" => {
// Collect a first "Catalog" object and use it for the future "Pages"
catalog_object = Some((
if let Some((id, _)) = catalog_object {
id
} else {
*object_id
},
object.clone(),
));
}
"Pages" => {
// Collect and update a first "Pages" object and use it for the future "Catalog"
// We also have to merge all dictionaries of the old and the new "Pages" objects
if let Ok(dictionary) = object.as_dict() {
let mut dictionary = dictionary.clone();
if let Some((_, ref object)) = pages_object {
if let Ok(old_dictionary) = object.as_dict() {
dictionary.extend(old_dictionary);
}
}
pages_object = Some((
if let Some((id, _)) = pages_object {
id
} else {
*object_id
},
Object::Dictionary(dictionary),
));
}
}
"Page" => {} // Ignored, processed later and separately
"Outlines" => {} // Ignored, not supported yet
"Outline" => {} // Ignored, not supported yet
_ => {
merged_doc.max_id += 1;
merged_doc.objects.insert(*object_id, object.clone());
}
}
}
for (object_id, object) in documents_pages.iter() {
if let Ok(dictionary) = object.as_dict() {
let mut dictionary = dictionary.clone();
dictionary.set("Parent", pages_object.as_ref().unwrap().0);
merged_doc
.objects
.insert(*object_id, Object::Dictionary(dictionary));
}
}
let catalog_object = catalog_object.unwrap();
let pages_object = pages_object.unwrap();
// Build a new "Pages" with updated fields
if let Ok(dictionary) = pages_object.1.as_dict() {
let mut dictionary = dictionary.clone();
// Set new pages count
dictionary.set("Count", documents_pages.len() as u32);
// Set new "Kids" list (collected from documents pages) for "Pages"
dictionary.set(
"Kids",
documents_pages
.into_keys()
.map(Object::Reference)
.collect::<Vec<_>>(),
);
merged_doc
.objects
.insert(pages_object.0, Object::Dictionary(dictionary));
}
// Build a new "Catalog" with updated fields
if let Ok(dictionary) = catalog_object.1.as_dict() {
let mut dictionary = dictionary.clone();
dictionary.set("Pages", pages_object.0);
dictionary.remove(b"Outlines"); // Outlines not supported in merged PDFs
merged_doc
.objects
.insert(catalog_object.0, Object::Dictionary(dictionary));
}
merged_doc.trailer.set("Root", catalog_object.0);
set_metadata(METADATA_TITLE, title, &mut merged_doc);
set_metadata(METADATA_PRODUCER, PRODUCER, &mut merged_doc);
// Update the max internal ID as it wasn't updated before due to direct object insertion
merged_doc.max_id = merged_doc.objects.len() as u32;
// Reorder all new Document objects
merged_doc.renumber_objects();
merged_doc.compress();
merged_doc.save(out)?;
Ok(())
}
/// Set metadata `key` to `value`.
///
/// Add the `Info` trailer to the pdf document if it does not yet exist.
fn set_metadata(key: &str, value: &str, doc: &mut Document) {
let info_dict_id = match doc.trailer.get(b"Info") {
Ok(&Object::Reference(id)) => id,
_ => {
// without this the following add_object call overwrites an existing
// object at max_id
doc.max_id += 1;
let id = doc.add_object(Dictionary::new());
doc.trailer.set("Info", Object::Reference(id));
id
}
};
if let Some(Object::Dictionary(ref mut info_dict)) = doc.objects.get_mut(&info_dict_id) {
info_dict.set(key, Object::string_literal(value));
}
}
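
A small sketch (not part of the commit) of what `set_metadata` guarantees: after the call, the trailer points at an `Info` dictionary that carries the key.

#[test]
fn sets_title_metadata() {
    let mut doc = Document::with_version(PDF_VERSION);
    set_metadata(METADATA_TITLE, "NZZ 1780-12-31", &mut doc);

    // The trailer now references an Info dictionary containing the Title entry.
    let info_id = match doc.trailer.get(b"Info").unwrap() {
        Object::Reference(id) => *id,
        _ => panic!("expected an Info reference"),
    };
    let info = doc.get_object(info_id).unwrap().as_dict().unwrap();
    assert!(info.get(METADATA_TITLE.as_bytes()).is_ok());
}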