support different issues on the same day

Sebastian Hugentobler 2025-02-24 10:54:28 +01:00
parent 7eea06d59b
commit 1e594d53ea
9 changed files with 443 additions and 343 deletions

Cargo.lock (generated, 706 changed lines)

File diff suppressed because it is too large

Cargo.toml

@@ -5,10 +5,10 @@ members = [
 ]
 
 [workspace.dependencies]
-anyhow = "1.0.86"
-clap = { version = "4.5.8", features = ["env", "derive"] }
+anyhow = "1.0.96"
+clap = { version = "4.5.30", features = ["env", "derive"] }
 serde_json = "1.0.120"
-tokio = { version = "1", features = ["full"] }
+tokio = { version = "1.43.0", features = ["full"] }
 
 [workspace.package]
 license = "AGPL-3.0"

nzz-cookie/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "nzz-cookie"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,7 +9,7 @@ repository = { workspace = true }
 
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-fantoccini = "0.19.3"
+fantoccini = "0.21.4"
 serde_json = { workspace = true }
 tokio = { workspace = true }

nzz-download/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "nzz-download"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,12 +9,12 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-lopdf = "0.32.0"
-reqwest = { version = "0.12.5", features = ["json"] }
-serde = { version = "1.0.203", features = ["derive"] }
+lopdf = "0.35.0"
+reqwest = { version = "0.12.12", features = ["json"] }
+serde = { version = "1.0.218", features = ["derive"] }
 serde_json = { workspace = true }
-tempfile = "3.10.1"
+tempfile = "3.17.1"
 tokio = { workspace = true }
-time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
-tracing = "0.1.40"
-tracing-subscriber = "0.3.18"
+time = { version = "0.3.37", features = ["macros", "serde", "formatting", "parsing" ] }
+tracing = "0.1.41"
+tracing-subscriber = "0.3.19"

(nzz-download source: issue fetching)

@@ -17,7 +17,10 @@ const MAX_DOWNLOADS: usize = 4;
 /// Fetch a single newspaper issue and save the merged pdf to `output_dir`.
 async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
-    info!("saving issue {}", issue.publication_date);
+    info!(
+        "saving issue {} ({})",
+        issue.publication_date, issue.edition
+    );
 
     let client = reqwest::Client::new();
     let tmp_dir = tempfile::tempdir()?;
@@ -39,9 +42,13 @@ async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
         pages.push(tmp_page);
     }
 
-    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
+    let issue_name = format!(
+        "nzz_{}_{}.pdf",
+        issue.publication_date,
+        issue.edition.to_lowercase()
+    );
     let issue_path = output_dir.join(issue_name);
-    let issue_title = format!("NZZ {}", issue.publication_date);
+    let issue_title = format!("NZZ {} ({})", issue.publication_date, issue.edition);
     pdf::merge(pages, &issue_path, &issue_title)?;
 
     debug!("issue {} saved", issue.publication_date);

(nzz-download source: search types and issue assembly)

@@ -22,7 +22,7 @@ struct SortOption {
 struct SearchData {
     query: String,
     offset: u32,
-    #[serde(
+    #[serde(
         rename = "startDate",
         serialize_with = "crate::date::serialize",
         deserialize_with = "crate::date::deserialize"
@@ -35,7 +35,7 @@ struct SearchData {
     )]
     end_date: Date,
     #[serde(rename = "sortOptions")]
-    sort_options: Vec<SortOption>
+    sort_options: Vec<SortOption>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -64,6 +64,7 @@ struct IssueData {
         deserialize_with = "crate::date::deserialize"
     )]
     publication_date: Date,
+    issue: String,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -100,6 +101,8 @@ struct PageHighRes {
 pub struct Issue {
     /// Date of publication.
     pub publication_date: Date,
+    /// Edition name (used to have multiple issues on the same date).
+    pub edition: String,
     /// ordered vector of page urls in the issue.
     pub pages: Vec<String>,
 }
@@ -109,7 +112,10 @@ impl SearchData {
         Self {
            query: "".to_string(),
            offset,
-           sort_options: vec![SortOption { field: "media_ts".to_string(), order: "desc".to_string() }],
+           sort_options: vec![SortOption {
+               field: "media_ts".to_string(),
+               order: "desc".to_string(),
+           }],
            start_date,
            end_date,
        }
@@ -177,6 +183,16 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
     Ok(pages)
 }
 
+/// Convert the various issue names to their descriptive equivalent.
+fn convert_issue_name(issue: &str) -> String {
+    match issue {
+        "Zürich" => "01 (Morgenausgabe)".into(),
+        "Ausgabe 02" => "02 (Mittagausgabe)".into(),
+        "Ausgabe 03" => "03 (Abendausgabe)".into(),
+        _ => unreachable!(),
+    }
+}
+
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
     let mut hydrated_issues = Vec::new();
@@ -197,6 +213,7 @@ async fn build_issues(cookie: &str, issues: Vec<IssueData>
         let pages = build_pages(&cookie, issue.edition_id).await?;
         Ok(Issue {
             publication_date: issue.publication_date,
+            edition: convert_issue_name(&issue.issue),
             pages,
         })
     });
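The new `convert_issue_name` maps the raw issue labels returned by the search API onto names with a stable numeric prefix, so files from the same day sort by edition; the `unreachable!()` arm means an unrecognized label panics rather than producing a misnamed file. Roughly, using only the labels from the diff:

// Expected behaviour of the mapping above (labels as in the diff).
assert_eq!(convert_issue_name("Zürich"), "01 (Morgenausgabe)");
assert_eq!(convert_issue_name("Ausgabe 02"), "02 (Mittagausgabe)");
// Any other label (say, a hypothetical "Beilage") hits unreachable!() and panics.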

(nzz-download source: pdf merging)

@@ -48,8 +48,8 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
     for (object_id, object) in documents_objects.iter() {
         // We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects
         // All other objects should be collected and inserted into the main Document
-        match object.type_name().unwrap_or("") {
-            "Catalog" => {
+        match object.type_name().unwrap_or(b"") {
+            b"Catalog" => {
                 // Collect a first "Catalog" object and use it for the future "Pages"
                 catalog_object = Some((
                     if let Some((id, _)) = catalog_object {
@@ -60,7 +60,7 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     object.clone(),
                 ));
             }
-            "Pages" => {
+            b"Pages" => {
                 // Collect and update a first "Pages" object and use it for the future "Catalog"
                 // We have also to merge all dictionaries of the old and the new "Pages" object
                 if let Ok(dictionary) = object.as_dict() {
@@ -81,9 +81,9 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     ));
                 }
             }
-            "Page" => {}     // Ignored, processed later and separately
-            "Outlines" => {} // Ignored, not supported yet
-            "Outline" => {}  // Ignored, not supported yet
+            b"Page" => {}     // Ignored, processed later and separately
+            b"Outlines" => {} // Ignored, not supported yet
+            b"Outline" => {}  // Ignored, not supported yet
             _ => {
                 merged_doc.max_id += 1;
                 merged_doc.objects.insert(*object_id, object.clone());
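The b-prefixed patterns track the lopdf bump in nzz-download's Cargo.toml: the 0.32-era `Object::type_name` returned a string slice, while the newer API, as the `unwrap_or(b"")` implies, hands back the raw PDF name bytes as `&[u8]`. Byte-string literals pattern-match just like `&str`; a minimal sketch of the same skip logic under that assumption:

// Minimal sketch, assuming type_name() now yields &[u8]: "Page", "Outlines"
// and "Outline" are skipped exactly as in the match above; everything else
// is collected for the merged document.
fn keep_object(type_name: &[u8]) -> bool {
    !matches!(type_name, b"Page" | b"Outlines" | b"Outline")
}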

flake.nix

@@ -1,4 +1,4 @@
-# thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
+# thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
 {
   description = "little-hesinde project";
   inputs = {
@@ -21,7 +21,7 @@
         fenixPkgs:
         fenixPkgs.fromToolchainFile {
           file = ./rust-toolchain.toml;
-          sha256 = "sha256-VZZnlyP69+Y3crrLHQyJirqlHrTtGTsyiSnZB8jEvVo=";
+          sha256 = "sha256-AJ6LX/Q/Er9kS15bn9iflkUwcgYqRQxiOIL2ToVAXaU=";
         };
 
       buildTargets = {

rust-toolchain.toml

@@ -1,4 +1,4 @@
 [toolchain]
 channel = "stable"
 targets = [ "i686-unknown-linux-musl", "aarch64-unknown-linux-musl", "x86_64-unknown-linux-musl" ]
-profile = "minimal"
+profile = "default"