rewrite so webdriver is only used for the login
Parent: 3edb6a7671
Commit: cf8fbe0965
26 changed files with 5385 additions and 1296 deletions
download/Cargo.lock (generated, Normal file, 1859 lines)
File diff suppressed because it is too large.
download/Cargo.toml (Normal file, 20 lines)
@@ -0,0 +1,20 @@
[package]
name = "nzz-download"
version = "0.1.0"
edition = "2021"
license = { workspace = true }
authors = { workspace = true }
repository = { workspace = true }

[dependencies]
anyhow = { workspace = true }
clap = { workspace = true }
lopdf = "0.32.0"
reqwest = { version = "0.12.5", features = ["json"] }
serde = { version = "1.0.203", features = ["derive"] }
serde_json = { workspace = true }
tempfile = "3.10.1"
tokio = { workspace = true }
time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing"] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"

download/src/cli.rs (Normal file, 31 lines)
@@ -0,0 +1,31 @@
//! CLI interface.

use std::path::PathBuf;

use clap::Parser;
use time::error::Parse;
use time::Date;

use crate::date::FORMAT;

/// Parse a date provided as a CLI argument.
fn parse_date(input: &str) -> Result<Date, Parse> {
    Date::parse(input, FORMAT)
}

/// Download issues of the NZZ newspaper
#[derive(Parser)]
#[command(version, about, long_about = None, after_help = "Provide the authentication cookie from stdin.")]
pub struct Config {
    /// Earliest issue to download (like 1780-12-31)
    #[arg(short, long, env, value_parser=parse_date)]
    pub from: Date,

    /// Latest issue to download (like 1780-12-31)
    #[arg(short, long, env, value_parser=parse_date)]
    pub to: Date,

    /// Output directory.
    #[arg(short, long, env, default_value = "./nzz")]
    pub output_dir: PathBuf,
}

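As a quick illustration of how this `Config` is meant to be consumed, a minimal sketch; the `parse_from` call and the sample dates are illustrative and not part of the commit:

use clap::Parser;
use nzz_download::cli::Config;

fn main() {
    // Simulate `nzz-download --from 1780-12-31 --to 1781-01-31`; parse_date
    // turns both strings into `time::Date` values, and `output_dir` falls back
    // to its default of "./nzz" (assuming the OUTPUT_DIR env var is unset).
    let config = Config::parse_from([
        "nzz-download",
        "--from",
        "1780-12-31",
        "--to",
        "1781-01-31",
    ]);
    assert_eq!(config.output_dir.to_str(), Some("./nzz"));
    println!("{} -> {}", config.from, config.to);
}
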
download/src/date.rs (Normal file, 27 lines)
@@ -0,0 +1,27 @@
//! Utilities for handling dates.

use serde::{Deserialize, Deserializer, Serializer};
use time::format_description::FormatItem;
use time::macros::format_description;
use time::Date;

/// Date format for newspaper issues (YYYY-MM-DD)
pub const FORMAT: &[FormatItem<'_>] = format_description!("[year]-[month]-[day]");

/// Serialize a date to a String with serde.
pub fn serialize<S>(value: &Date, serializer: S) -> Result<S::Ok, S::Error>
where
    S: Serializer,
{
    let formatted = value.format(&FORMAT).unwrap();
    serializer.serialize_str(&formatted)
}

/// Deserialize a String to a Date with serde.
pub fn deserialize<'de, D>(deserializer: D) -> Result<Date, D::Error>
where
    D: Deserializer<'de>,
{
    let s: &str = Deserialize::deserialize(deserializer)?;
    Date::parse(s, FORMAT).map_err(serde::de::Error::custom)
}

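For context, a small sketch of how these helpers plug into a struct via `serialize_with`/`deserialize_with`; the `Example` struct and the JSON literal are hypothetical:

use serde::{Deserialize, Serialize};
use time::Date;

#[derive(Serialize, Deserialize)]
struct Example {
    // Route (de)serialization of this field through the helpers above.
    #[serde(
        serialize_with = "nzz_download::date::serialize",
        deserialize_with = "nzz_download::date::deserialize"
    )]
    published: Date,
}

fn main() -> anyhow::Result<()> {
    let parsed: Example = serde_json::from_str(r#"{"published":"1780-12-31"}"#)?;
    // Serializing it again yields the same YYYY-MM-DD string.
    println!("{}", serde_json::to_string(&parsed)?);
    Ok(())
}
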
download/src/download.rs (Normal file, 52 lines)
@@ -0,0 +1,52 @@
//! Handle downloads of newspaper issues.

use std::{
    fs::{self},
    io::{Cursor, Read},
    path::Path,
};

use anyhow::Result;
use tracing::{debug, info};

use crate::{nzz::Issue, pdf};

/// Download all pages of the provided `issues` and save them merged to the directory `output_dir`.
///
/// Create `output_dir` if it does not exist.
pub async fn fetch(issues: Vec<Issue>, output_dir: &Path) -> Result<()> {
    debug!("ensuring {output_dir:?} exists");
    fs::create_dir_all(output_dir)?;

    for issue in issues {
        info!("saving issue {}", issue.publication_date);

        let tmp_dir = tempfile::tempdir()?;
        let mut pages = Vec::new();
        for (i, page) in issue.pages.into_iter().enumerate() {
            debug!(
                "fetching issue {}, page {}: {page}",
                issue.publication_date,
                i + 1
            );

            let response = reqwest::Client::new().get(page).send().await?;
            let mut content = Cursor::new(response.bytes().await?);
            let mut page_data = Vec::new();
            content.read_to_end(&mut page_data)?;

            let tmp_page = tmp_dir.path().join(i.to_string());
            fs::write(&tmp_page, page_data)?;
            pages.push(tmp_page);
        }

        let issue_name = format!("nzz_{}.pdf", issue.publication_date);
        let issue_path = output_dir.join(issue_name);
        let issue_title = format!("NZZ {}", issue.publication_date);

        pdf::merge(pages, &issue_path, &issue_title)?;
        debug!("issue {} saved", issue.publication_date);
    }

    Ok(())
}

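A minimal sketch of driving `download::fetch` directly with a hand-built `Issue`; the page URL below is a placeholder, not a real archive endpoint:

use std::path::Path;

use nzz_download::{download, nzz::Issue};
use time::macros::date;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // One hand-built issue with a single page; fetch() downloads the page into
    // a temporary directory and merges it into ./nzz/nzz_1780-12-31.pdf.
    let issue = Issue {
        publication_date: date!(1780 - 12 - 31),
        pages: vec!["https://example.com/page-1.pdf".to_string()],
    };
    download::fetch(vec![issue], Path::new("./nzz")).await
}
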
download/src/lib.rs (Normal file, 19 lines)
@@ -0,0 +1,19 @@
//! A small utility to download issues of the NZZ newspaper.

use anyhow::Result;

use cli::Config;

pub mod cli;
pub mod date;
pub mod download;
pub mod nzz;
pub mod pdf;

/// Entry point to download nzz issues.
pub async fn run(args: Config, cookie: &str) -> Result<()> {
    let issues = nzz::fetch(cookie, args.from, args.to).await?;
    download::fetch(issues, &args.output_dir).await?;

    Ok(())
}

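For reference, a minimal sketch of calling the library entry point directly, bypassing argument parsing; the dates and the cookie value are placeholders:

use std::path::PathBuf;

use nzz_download::cli::Config;
use time::macros::date;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Build the Config by hand instead of parsing command line arguments.
    let config = Config {
        from: date!(1780 - 12 - 31),
        to: date!(1781 - 01 - 31),
        output_dir: PathBuf::from("./nzz"),
    };
    // The cookie value stands in for a real archive session cookie.
    nzz_download::run(config, "SESSION=placeholder-cookie-value").await
}
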
download/src/main.rs (Normal file, 29 lines)
@@ -0,0 +1,29 @@
use std::io::{self, Read};

use anyhow::Result;
use clap::Parser;
use nzz_download::cli::Config;

#[tokio::main]
async fn main() -> Result<()> {
    if std::env::var_os("RUST_LOG").is_none() {
        std::env::set_var("RUST_LOG", "info");
    }

    tracing_subscriber::fmt::init();

    let args = Config::parse();
    let cookie = read_cookie().expect("Provide the authentication cookie via stdin");

    nzz_download::run(args, &cookie).await
}

/// Read the authentication cookie from stdin.
fn read_cookie() -> Result<String> {
    let stdin = io::stdin();
    let mut buffer = String::new();

    stdin.lock().read_to_string(&mut buffer)?;
    let cookie = buffer.trim();
    Ok(cookie.to_string())
}

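To show the stdin contract end to end, a sketch that spawns the built binary and pipes the cookie into it; the binary name and cookie value are placeholders:

use std::io::Write;
use std::process::{Command, Stdio};

fn main() -> std::io::Result<()> {
    // Equivalent of: echo "$NZZ_COOKIE" | nzz-download --from 1780-12-31 --to 1781-01-31
    let mut child = Command::new("nzz-download")
        .args(["--from", "1780-12-31", "--to", "1781-01-31"])
        .stdin(Stdio::piped())
        .spawn()?;
    child
        .stdin
        .take()
        .expect("stdin is piped")
        .write_all(b"SESSION=placeholder-cookie-value\n")?;
    // Dropping the pipe above closes stdin, so read_cookie() sees EOF.
    let status = child.wait()?;
    println!("downloader exited with {status}");
    Ok(())
}
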
download/src/nzz.rs (Normal file, 198 lines)
@@ -0,0 +1,198 @@
//! Handle information relating to NZZ issues.

use anyhow::Result;
use serde::{Deserialize, Serialize};
use time::Date;
use tracing::info;

const SEARCH_URL: &str = "https://zeitungsarchiv.nzz.ch/solr-epaper-search/1.0/search";
const ISSUE_URL: &str = "https://zeitungsarchiv.nzz.ch/archive/1.0/getPages";

#[derive(Debug, Serialize, Deserialize)]
struct SearchData {
    query: String,
    offset: u32,
    #[serde(rename = "sortField")]
    sort_field: String,
    #[serde(rename = "sortOrder")]
    sort_order: String,
    #[serde(
        rename = "startDate",
        serialize_with = "crate::date::serialize",
        deserialize_with = "crate::date::deserialize"
    )]
    start_date: Date,
    #[serde(
        rename = "endDate",
        serialize_with = "crate::date::serialize",
        deserialize_with = "crate::date::deserialize"
    )]
    end_date: Date,
}

#[derive(Debug, Serialize, Deserialize)]
struct SearchResult {
    data: SearchInfo,
}

#[derive(Debug, Serialize, Deserialize)]
struct SearchInfo {
    total: u32,
    offset: u32,
    #[serde(rename = "pageSize")]
    page_size: u32,
    #[serde(rename = "resData")]
    res_data: Vec<IssueData>,
}

#[derive(Debug, Serialize, Deserialize)]
struct IssueData {
    #[serde(rename = "editionId")]
    edition_id: u32,
    #[serde(rename = "pageNumber")]
    page_nr: u32,
    #[serde(
        rename = "publicationDate",
        deserialize_with = "crate::date::deserialize"
    )]
    publication_date: Date,
}

#[derive(Debug, Serialize, Deserialize)]
struct PagesResult {
    data: PagesInfo,
}

#[derive(Debug, Serialize, Deserialize)]
struct PagesInfo {
    pages: Vec<Page>,
}

#[derive(Debug, Serialize, Deserialize)]
struct Page {
    #[serde(rename = "pmPageNumber")]
    page_nr: u32,
    #[serde(rename = "pageDocUrl")]
    doc: PageDoc,
}

#[derive(Debug, Serialize, Deserialize)]
struct PageDoc {
    #[serde(rename = "HIGHRES")]
    link: PageHighRes,
}

#[derive(Debug, Serialize, Deserialize)]
struct PageHighRes {
    url: String,
}

/// A single NZZ issue.
#[derive(Debug, Clone)]
pub struct Issue {
    /// Date of publication.
    pub publication_date: Date,
    /// Ordered vector of page URLs in the issue.
    pub pages: Vec<String>,
}

impl SearchData {
    pub fn new(offset: u32, start_date: Date, end_date: Date) -> Self {
        Self {
            query: "".to_string(),
            offset,
            sort_field: "media_ts".to_string(),
            sort_order: "desc".to_string(),
            start_date,
            end_date,
        }
    }
}

/// Search all issues between `from` and `to` (inclusive) using an `offset` into the results.
async fn offset_search(offset: u32, cookie: &str, from: Date, to: Date) -> Result<SearchInfo> {
    let data = SearchData::new(offset, from, to);
    let result: SearchResult = reqwest::Client::new()
        .post(SEARCH_URL)
        .header("Cookie", cookie)
        .json(&data)
        .send()
        .await?
        .json()
        .await?;
    Ok(result.data)
}

/// Only keep first pages; they are enough to get the edition id.
fn filter_issues(unfiltered_issues: Vec<IssueData>) -> Vec<IssueData> {
    unfiltered_issues
        .into_iter()
        .filter(|info| info.page_nr == 1)
        .collect()
}

/// Search all issues between `from` and `to` (inclusive), respecting pagination.
async fn search(cookie: &str, from: Date, to: Date) -> Result<Vec<IssueData>> {
    info!("looking for issues between {from} and {to}");
    let mut result = offset_search(0, cookie, from, to).await?;
    let mut issues: Vec<IssueData> = filter_issues(result.res_data);

    while result.offset + result.page_size < result.total {
        result = offset_search(result.offset + result.page_size, cookie, from, to).await?;
        issues.extend(filter_issues(result.res_data));
    }

    Ok(issues)
}

/// Fetch all page URLs for the issue with edition id `edition_id` and order them by page number.
async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
    let result: PagesResult = reqwest::Client::new()
        .post(ISSUE_URL)
        .header("Cookie", cookie)
        .json(&serde_json::json!({
            "editionId": edition_id,
        }))
        .send()
        .await?
        .json()
        .await?;

    let mut pages: Vec<(u32, String)> = result
        .data
        .pages
        .into_iter()
        .map(|page| (page.page_nr, page.doc.link.url))
        .collect();
    pages.sort_by(|a, b| a.0.cmp(&b.0));
    let pages = pages.into_iter().map(|page| page.1).collect();

    Ok(pages)
}

/// Fetch all page URLs for `issues`.
async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
    let mut hydrated_issues = Vec::new();
    for issue in issues {
        info!(
            "fetching page information for issue {}",
            issue.publication_date
        );
        let pages = build_pages(cookie, issue.edition_id).await?;
        hydrated_issues.push(Issue {
            publication_date: issue.publication_date,
            pages,
        });
    }

    Ok(hydrated_issues)
}

/// Fetch issue information in the date range `from` - `to` (inclusive), using `cookie` for
/// authentication.
pub async fn fetch(cookie: &str, from: Date, to: Date) -> Result<Vec<Issue>> {
    let issues = search(cookie, from, to).await?;
    let issues = build_issues(cookie, issues).await?;

    Ok(issues)
}

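To make the pagination in `search` concrete, here is the same offset arithmetic as a self-contained toy; the totals are made up and no network is involved:

fn main() {
    // Mirror of the loop in search(): request offset 0 first, then keep
    // advancing by page_size while offset + page_size < total.
    let (total, page_size) = (95u32, 40u32);
    let mut offset = 0;
    let mut requested = vec![offset];
    while offset + page_size < total {
        offset += page_size;
        requested.push(offset);
    }
    // Three requests (offsets 0, 40 and 80) cover all 95 results.
    assert_eq!(requested, vec![0, 40, 80]);
    println!("offsets requested: {requested:?}");
}
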
download/src/pdf.rs (Normal file, 177 lines)
@@ -0,0 +1,177 @@
//! Manipulate pdf documents.

use std::{
    collections::BTreeMap,
    path::{Path, PathBuf},
};

use anyhow::Result;
use lopdf::{Dictionary, Document, Object, ObjectId};

const METADATA_TITLE: &str = "Title";
const METADATA_PRODUCER: &str = "Producer";
const PDF_VERSION: &str = "1.8";
const PRODUCER: &str = "NZZ Downloader";

/// Merge the provided pdfs in the `input` vector into one pdf at `out`, setting its title to
/// `title`.
///
/// The code is from https://github.com/J-F-Liu/lopdf/blob/6b04581640e061bfeb39b585e50a7e9d102b8fe2/examples/merge.rs
/// with some modifications. I have no clue about PDF structure and this is still a bit of a
/// mystery to me.
pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
    let mut max_id = 1;
    let mut documents_pages = BTreeMap::new();
    let mut documents_objects = BTreeMap::new();
    let mut merged_doc = Document::with_version(PDF_VERSION);

    for pdf in input {
        let mut doc = Document::load(pdf)?;

        doc.renumber_objects_with(max_id);

        max_id = doc.max_id + 1;

        documents_pages.extend(
            doc.get_pages()
                .into_values()
                .map(|object_id| (object_id, doc.get_object(object_id).unwrap().to_owned()))
                .collect::<BTreeMap<ObjectId, Object>>(),
        );
        documents_objects.extend(doc.objects);
    }

    let mut catalog_object: Option<(ObjectId, Object)> = None;
    let mut pages_object: Option<(ObjectId, Object)> = None;

    // Process all objects except "Page" type
    for (object_id, object) in documents_objects.iter() {
        // We have to ignore "Page" (they are processed later), "Outlines" and "Outline" objects.
        // All other objects should be collected and inserted into the main Document.
        match object.type_name().unwrap_or("") {
            "Catalog" => {
                // Collect a first "Catalog" object and use it for the future "Pages"
                catalog_object = Some((
                    if let Some((id, _)) = catalog_object {
                        id
                    } else {
                        *object_id
                    },
                    object.clone(),
                ));
            }
            "Pages" => {
                // Collect and update a first "Pages" object and use it for the future "Catalog".
                // We also have to merge all dictionaries of the old and the new "Pages" object.
                if let Ok(dictionary) = object.as_dict() {
                    let mut dictionary = dictionary.clone();
                    if let Some((_, ref object)) = pages_object {
                        if let Ok(old_dictionary) = object.as_dict() {
                            dictionary.extend(old_dictionary);
                        }
                    }

                    pages_object = Some((
                        if let Some((id, _)) = pages_object {
                            id
                        } else {
                            *object_id
                        },
                        Object::Dictionary(dictionary),
                    ));
                }
            }
            "Page" => {}     // Ignored, processed later and separately
            "Outlines" => {} // Ignored, not supported yet
            "Outline" => {}  // Ignored, not supported yet
            _ => {
                merged_doc.max_id += 1;
                merged_doc.objects.insert(*object_id, object.clone());
            }
        }
    }

    for (object_id, object) in documents_pages.iter() {
        if let Ok(dictionary) = object.as_dict() {
            let mut dictionary = dictionary.clone();
            dictionary.set("Parent", pages_object.as_ref().unwrap().0);

            merged_doc
                .objects
                .insert(*object_id, Object::Dictionary(dictionary));
        }
    }

    let catalog_object = catalog_object.unwrap();
    let pages_object = pages_object.unwrap();

    // Build a new "Pages" with updated fields
    if let Ok(dictionary) = pages_object.1.as_dict() {
        let mut dictionary = dictionary.clone();

        // Set new pages count
        dictionary.set("Count", documents_pages.len() as u32);

        // Set new "Kids" list (collected from documents pages) for "Pages"
        dictionary.set(
            "Kids",
            documents_pages
                .into_keys()
                .map(Object::Reference)
                .collect::<Vec<_>>(),
        );

        merged_doc
            .objects
            .insert(pages_object.0, Object::Dictionary(dictionary));
    }

    // Build a new "Catalog" with updated fields
    if let Ok(dictionary) = catalog_object.1.as_dict() {
        let mut dictionary = dictionary.clone();
        dictionary.set("Pages", pages_object.0);
        dictionary.remove(b"Outlines"); // Outlines not supported in merged PDFs

        merged_doc
            .objects
            .insert(catalog_object.0, Object::Dictionary(dictionary));
    }

    merged_doc.trailer.set("Root", catalog_object.0);

    set_metadata(METADATA_TITLE, title, &mut merged_doc);
    set_metadata(METADATA_PRODUCER, PRODUCER, &mut merged_doc);

    // Update the max internal ID as it wasn't updated before due to direct objects insertion
    merged_doc.max_id = merged_doc.objects.len() as u32;

    // Reorder all new Document objects
    merged_doc.renumber_objects();

    merged_doc.compress();
    merged_doc.save(out)?;

    Ok(())
}

/// Set metadata `key` to `value`.
///
/// Add the `Info` trailer to the pdf document if it does not yet exist.
fn set_metadata(key: &str, value: &str, doc: &mut Document) {
    let info_dict_id = match doc.trailer.get(b"Info") {
        Ok(&Object::Reference(id)) => id,
        _ => {
            // without this the following add_object call overwrites an existing
            // object at max_id
            doc.max_id += 1;

            let id = doc.add_object(Dictionary::new());
            doc.trailer.set("Info", Object::Reference(id));
            id
        }
    };

    if let Some(Object::Dictionary(ref mut info_dict)) = doc.objects.get_mut(&info_dict_id) {
        info_dict.set(key, Object::string_literal(value));
    }
}

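And a usage sketch for `pdf::merge`, assuming two single-page PDFs were already downloaded to the paths below; the paths and title are placeholders:

use std::path::{Path, PathBuf};

use nzz_download::pdf;

fn main() -> anyhow::Result<()> {
    // Merge two already-downloaded page PDFs into one issue PDF; merge() also
    // stamps the Title and Producer metadata via set_metadata().
    let pages = vec![
        PathBuf::from("/tmp/nzz-pages/1.pdf"),
        PathBuf::from("/tmp/nzz-pages/2.pdf"),
    ];
    pdf::merge(
        pages,
        Path::new("/tmp/nzz/nzz_1780-12-31.pdf"),
        "NZZ 1780-12-31",
    )?;
    Ok(())
}
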