support different issues on the same day

Sebastian Hugentobler 2025-02-24 10:54:28 +01:00
parent 7eea06d59b
commit 1e594d53ea
9 changed files with 443 additions and 343 deletions

Cargo.lock (generated, 706 changed lines): diff suppressed because it is too large.

Cargo.toml

@@ -5,10 +5,10 @@ members = [
 ]
 
 [workspace.dependencies]
-anyhow = "1.0.86"
-clap = { version = "4.5.8", features = ["env", "derive"] }
+anyhow = "1.0.96"
+clap = { version = "4.5.30", features = ["env", "derive"] }
 serde_json = "1.0.120"
-tokio = { version = "1", features = ["full"] }
+tokio = { version = "1.43.0", features = ["full"] }
 
 [workspace.package]
 license = "AGPL-3.0"

nzz-cookie/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "nzz-cookie"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,7 +9,7 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-fantoccini = "0.19.3"
+fantoccini = "0.21.4"
 serde_json = { workspace = true }
 tokio = { workspace = true }
 

nzz-download/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "nzz-download"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,12 +9,12 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-lopdf = "0.32.0"
-reqwest = { version = "0.12.5", features = ["json"] }
-serde = { version = "1.0.203", features = ["derive"] }
+lopdf = "0.35.0"
+reqwest = { version = "0.12.12", features = ["json"] }
+serde = { version = "1.0.218", features = ["derive"] }
 serde_json = { workspace = true }
-tempfile = "3.10.1"
+tempfile = "3.17.1"
 tokio = { workspace = true }
-time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
+time = { version = "0.3.37", features = ["macros", "serde", "formatting", "parsing" ] }
-tracing = "0.1.40"
-tracing-subscriber = "0.3.18"
+tracing = "0.1.41"
+tracing-subscriber = "0.3.19"

nzz-download/src/main.rs

@@ -17,7 +17,10 @@ const MAX_DOWNLOADS: usize = 4;
 /// Fetch a single newspaper issue and save the merged pdf to `output_dir`.
 async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
-    info!("saving issue {}", issue.publication_date);
+    info!(
+        "saving issue {} ({})",
+        issue.publication_date, issue.edition
+    );
 
     let client = reqwest::Client::new();
     let tmp_dir = tempfile::tempdir()?;
 
@@ -39,9 +42,13 @@ async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
         pages.push(tmp_page);
     }
 
-    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
+    let issue_name = format!(
+        "nzz_{}_{}.pdf",
+        issue.publication_date,
+        issue.edition.to_lowercase()
+    );
     let issue_path = output_dir.join(issue_name);
-    let issue_title = format!("NZZ {}", issue.publication_date);
+    let issue_title = format!("NZZ {} ({})", issue.publication_date, issue.edition);
 
     pdf::merge(pages, &issue_path, &issue_title)?;
     debug!("issue {} saved", issue.publication_date);


@@ -22,7 +22,7 @@ struct SortOption {
 struct SearchData {
     query: String,
     offset: u32,
     #[serde(
         rename = "startDate",
         serialize_with = "crate::date::serialize",
         deserialize_with = "crate::date::deserialize"
@@ -35,7 +35,7 @@ struct SearchData {
     )]
     end_date: Date,
     #[serde(rename = "sortOptions")]
-    sort_options: Vec<SortOption>
+    sort_options: Vec<SortOption>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
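The rename attributes map the snake_case Rust fields onto the camelCase keys of the search API's JSON body. A self-contained sketch of the same serde pattern (the date fields are simplified to plain strings standing in for the time::Date fields handled by crate::date; the sample values are made up):

use serde::{Deserialize, Serialize};

#[derive(Debug, Serialize, Deserialize)]
struct SortOption {
    field: String,
    order: String,
}

#[derive(Debug, Serialize, Deserialize)]
struct SearchData {
    query: String,
    offset: u32,
    // Plain strings stand in for the time::Date fields of the real struct.
    #[serde(rename = "startDate")]
    start_date: String,
    #[serde(rename = "endDate")]
    end_date: String,
    #[serde(rename = "sortOptions")]
    sort_options: Vec<SortOption>,
}

fn main() -> Result<(), serde_json::Error> {
    let data = SearchData {
        query: "".to_string(),
        offset: 0,
        start_date: "1893-04-06".to_string(),
        end_date: "1893-04-06".to_string(),
        sort_options: vec![SortOption {
            field: "media_ts".to_string(),
            order: "desc".to_string(),
        }],
    };
    // Prints: {"query":"","offset":0,"startDate":"1893-04-06",
    //          "endDate":"1893-04-06","sortOptions":[{"field":"media_ts","order":"desc"}]}
    println!("{}", serde_json::to_string(&data)?);
    Ok(())
}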
@@ -64,6 +64,7 @@ struct IssueData {
         deserialize_with = "crate::date::deserialize"
     )]
     publication_date: Date,
+    issue: String,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -100,6 +101,8 @@ struct PageHighRes {
 pub struct Issue {
     /// Date of publication.
     pub publication_date: Date,
+    /// Edition name (used to have multiple issues on the same date).
+    pub edition: String,
     /// ordered vector of page urls in the issue.
     pub pages: Vec<String>,
 }
@@ -109,7 +112,10 @@ impl SearchData {
         Self {
             query: "".to_string(),
             offset,
-            sort_options: vec![SortOption { field: "media_ts".to_string(), order: "desc".to_string() }],
+            sort_options: vec![SortOption {
+                field: "media_ts".to_string(),
+                order: "desc".to_string(),
+            }],
             start_date,
             end_date,
         }
@@ -177,6 +183,16 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>>
     Ok(pages)
 }
 
+/// Convert the various issue names to their descriptive equivalent.
+fn convert_issue_name(issue: &str) -> String {
+    match issue {
+        "Zürich" => "01 (Morgenausgabe)".into(),
+        "Ausgabe 02" => "02 (Mittagausgabe)".into(),
+        "Ausgabe 03" => "03 (Abendausgabe)".into(),
+        _ => unreachable!(),
+    }
+}
+
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
     let mut hydrated_issues = Vec::new();
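For illustration, the helper's behaviour on the three known issue names (these asserts exercise convert_issue_name as defined above; the _ => unreachable!() arm encodes the assumption that the archive never reports any other name, and the program will panic if it does):

assert_eq!(convert_issue_name("Zürich"), "01 (Morgenausgabe)");
assert_eq!(convert_issue_name("Ausgabe 02"), "02 (Mittagausgabe)");
assert_eq!(convert_issue_name("Ausgabe 03"), "03 (Abendausgabe)");
// convert_issue_name("Ausgabe 04") would hit unreachable!() and panic.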
@@ -197,6 +213,7 @@ async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>
             let pages = build_pages(&cookie, issue.edition_id).await?;
             Ok(Issue {
                 publication_date: issue.publication_date,
+                edition: convert_issue_name(&issue.issue),
                 pages,
             })
         });

nzz-download/src/pdf.rs

@@ -48,8 +48,8 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
     for (object_id, object) in documents_objects.iter() {
         // We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects
         // All other objects should be collected and inserted into the main Document
-        match object.type_name().unwrap_or("") {
-            "Catalog" => {
+        match object.type_name().unwrap_or(b"") {
+            b"Catalog" => {
                 // Collect a first "Catalog" object and use it for the future "Pages"
                 catalog_object = Some((
                     if let Some((id, _)) = catalog_object {
@@ -60,7 +60,7 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     object.clone(),
                 ));
             }
-            "Pages" => {
+            b"Pages" => {
                 // Collect and update a first "Pages" object and use it for the future "Catalog"
                 // We have also to merge all dictionaries of the old and the new "Pages" object
                 if let Ok(dictionary) = object.as_dict() {
@@ -81,9 +81,9 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     ));
                 }
             }
-            "Page" => {}     // Ignored, processed later and separately
-            "Outlines" => {} // Ignored, not supported yet
-            "Outline" => {}  // Ignored, not supported yet
+            b"Page" => {}     // Ignored, processed later and separately
+            b"Outlines" => {} // Ignored, not supported yet
+            b"Outline" => {}  // Ignored, not supported yet
             _ => {
                 merged_doc.max_id += 1;
                 merged_doc.objects.insert(*object_id, object.clone());
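The switch from str literals ("Catalog") to byte-string literals (b"Catalog") tracks the lopdf upgrade: as the unwrap_or(b"") above indicates, type_name() now yields the PDF object's type as raw bytes (&[u8]) rather than &str, so the match arms compare byte strings. A standalone sketch of the pattern (plain Rust, no lopdf required; the names are illustrative):

fn classify(type_name: &[u8]) -> &'static str {
    // Byte-string patterns match &[u8] just like string literals match &str.
    match type_name {
        b"Catalog" => "document catalog",
        b"Pages" => "page tree",
        b"Page" => "single page",
        _ => "other object",
    }
}

fn main() {
    assert_eq!(classify(b"Catalog"), "document catalog");
    assert_eq!(classify("Unknown".as_bytes()), "other object");
}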

flake.nix

@@ -1,4 +1,4 @@
 # thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
 {
   description = "little-hesinde project";
   inputs = {
@@ -21,7 +21,7 @@
         fenixPkgs:
         fenixPkgs.fromToolchainFile {
           file = ./rust-toolchain.toml;
-          sha256 = "sha256-VZZnlyP69+Y3crrLHQyJirqlHrTtGTsyiSnZB8jEvVo=";
+          sha256 = "sha256-AJ6LX/Q/Er9kS15bn9iflkUwcgYqRQxiOIL2ToVAXaU=";
         };
 
         buildTargets = {

rust-toolchain.toml

@@ -1,4 +1,4 @@
 [toolchain]
 channel = "stable"
 targets = [ "i686-unknown-linux-musl", "aarch64-unknown-linux-musl", "x86_64-unknown-linux-musl" ]
-profile = "minimal"
+profile = "default"