support different issues on the same day
parent 7eea06d59b
commit 1e594d53ea
Cargo.lock (generated, 706 lines changed)
File diff suppressed because it is too large
@@ -5,10 +5,10 @@ members = [
 ]
 
 [workspace.dependencies]
-anyhow = "1.0.86"
-clap = { version = "4.5.8", features = ["env", "derive"] }
+anyhow = "1.0.96"
+clap = { version = "4.5.30", features = ["env", "derive"] }
 serde_json = "1.0.120"
-tokio = { version = "1", features = ["full"] }
+tokio = { version = "1.43.0", features = ["full"] }
 
 [workspace.package]
 license = "AGPL-3.0"
@@ -1,6 +1,6 @@
 [package]
 name = "nzz-cookie"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,7 +9,7 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-fantoccini = "0.19.3"
+fantoccini = "0.21.4"
 serde_json = { workspace = true }
 tokio = { workspace = true }
 
@@ -1,6 +1,6 @@
 [package]
 name = "nzz-download"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,12 +9,12 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-lopdf = "0.32.0"
-reqwest = { version = "0.12.5", features = ["json"] }
-serde = { version = "1.0.203", features = ["derive"] }
+lopdf = "0.35.0"
+reqwest = { version = "0.12.12", features = ["json"] }
+serde = { version = "1.0.218", features = ["derive"] }
 serde_json = { workspace = true }
-tempfile = "3.10.1"
+tempfile = "3.17.1"
 tokio = { workspace = true }
-time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
-tracing = "0.1.40"
-tracing-subscriber = "0.3.18"
+time = { version = "0.3.37", features = ["macros", "serde", "formatting", "parsing" ] }
+tracing = "0.1.41"
+tracing-subscriber = "0.3.19"
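Note that the lopdf bump from 0.32.0 to 0.35.0 is what drives the byte-string changes in the PDF merge hunks further down: newer lopdf reports object type names as bytes (b"Catalog") rather than &str.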
@@ -17,7 +17,10 @@ const MAX_DOWNLOADS: usize = 4;
 
 /// Fetch a single newspaper issue and save the merged pdf to `output_dir`.
 async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
-    info!("saving issue {}", issue.publication_date);
+    info!(
+        "saving issue {} ({})",
+        issue.publication_date, issue.edition
+    );
 
     let client = reqwest::Client::new();
     let tmp_dir = tempfile::tempdir()?;
@@ -39,9 +42,13 @@ async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
         pages.push(tmp_page);
     }
 
-    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
+    let issue_name = format!(
+        "nzz_{}_{}.pdf",
+        issue.publication_date,
+        issue.edition.to_lowercase()
+    );
     let issue_path = output_dir.join(issue_name);
-    let issue_title = format!("NZZ {}", issue.publication_date);
+    let issue_title = format!("NZZ {} ({})", issue.publication_date, issue.edition);
 
     pdf::merge(pages, &issue_path, &issue_title)?;
     debug!("issue {} saved", issue.publication_date);
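To make the new naming scheme concrete, here is a minimal sketch (not part of the commit; stand-in values, assuming the publication date renders as YYYY-MM-DD the way time::Date's Display does):

    fn main() {
        // Stand-ins for issue.publication_date and issue.edition.
        let publication_date = "1946-05-02";
        let edition = "01 (Morgenausgabe)";

        // Mirrors the two format! calls introduced above.
        let issue_name = format!("nzz_{}_{}.pdf", publication_date, edition.to_lowercase());
        let issue_title = format!("NZZ {} ({})", publication_date, edition);

        assert_eq!(issue_name, "nzz_1946-05-02_01 (morgenausgabe).pdf");
        assert_eq!(issue_title, "NZZ 1946-05-02 (01 (Morgenausgabe))");
    }

Two morning and evening issues from the same date therefore no longer overwrite each other on disk.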
@@ -22,7 +22,7 @@ struct SortOption {
 struct SearchData {
     query: String,
     offset: u32,
-    #[serde(
+    #[serde(
         rename = "startDate",
         serialize_with = "crate::date::serialize",
         deserialize_with = "crate::date::deserialize"
@@ -35,7 +35,7 @@ struct SearchData {
     )]
     end_date: Date,
     #[serde(rename = "sortOptions")]
-    sort_options: Vec<SortOption>
+    sort_options: Vec<SortOption>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
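The rename attributes exist because the search API expects camelCase keys. As an illustrative sketch only (values made up, date strings left to crate::date::serialize), a SearchData serializes roughly like:

    // {
    //   "query": "",
    //   "offset": 0,
    //   "startDate": "<serialized start date>",
    //   "endDate": "<serialized end date>",
    //   "sortOptions": [{ "field": "media_ts", "order": "desc" }]
    // }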
@@ -64,6 +64,7 @@ struct IssueData {
         deserialize_with = "crate::date::deserialize"
     )]
     publication_date: Date,
+    issue: String,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -100,6 +101,8 @@ struct PageHighRes {
 pub struct Issue {
     /// Date of publication.
     pub publication_date: Date,
+    /// Edition name (used to have multiple issues on the same date).
+    pub edition: String,
     /// ordered vector of page urls in the issue.
     pub pages: Vec<String>,
 }
@@ -109,7 +112,10 @@ impl SearchData {
         Self {
             query: "".to_string(),
             offset,
-            sort_options: vec![SortOption { field: "media_ts".to_string(), order: "desc".to_string() }],
+            sort_options: vec![SortOption {
+                field: "media_ts".to_string(),
+                order: "desc".to_string(),
+            }],
             start_date,
             end_date,
         }
@@ -177,6 +183,16 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
     Ok(pages)
 }
 
+/// Convert the various issue names to their descriptive equivalent.
+fn convert_issue_name(issue: &str) -> String {
+    match issue {
+        "Zürich" => "01 (Morgenausgabe)".into(),
+        "Ausgabe 02" => "02 (Mittagausgabe)".into(),
+        "Ausgabe 03" => "03 (Abendausgabe)".into(),
+        _ => unreachable!(),
+    }
+}
+
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
     let mut hydrated_issues = Vec::new();
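A design note on the new helper: the unreachable!() arm means any issue name outside these three aborts the run, encoding the assumption that the archive only ever uses those names. If that ever proves too strict, a non-panicking variant is a small change (sketch only; the function name is hypothetical):

    fn convert_issue_name_lossy(issue: &str) -> String {
        match issue {
            "Zürich" => "01 (Morgenausgabe)".into(),
            "Ausgabe 02" => "02 (Mittagausgabe)".into(),
            "Ausgabe 03" => "03 (Abendausgabe)".into(),
            // Fall back to the raw name instead of panicking on unknown input.
            other => other.into(),
        }
    }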
@@ -197,6 +213,7 @@ async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>
         let pages = build_pages(&cookie, issue.edition_id).await?;
         Ok(Issue {
             publication_date: issue.publication_date,
+            edition: convert_issue_name(&issue.issue),
             pages,
         })
     });
@@ -48,8 +48,8 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
     for (object_id, object) in documents_objects.iter() {
         // We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects
        // All other objects should be collected and inserted into the main Document
-        match object.type_name().unwrap_or("") {
-            "Catalog" => {
+        match object.type_name().unwrap_or(b"") {
+            b"Catalog" => {
                 // Collect a first "Catalog" object and use it for the future "Pages"
                 catalog_object = Some((
                     if let Some((id, _)) = catalog_object {
@@ -60,7 +60,7 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     object.clone(),
                 ));
             }
-            "Pages" => {
+            b"Pages" => {
                 // Collect and update a first "Pages" object and use it for the future "Catalog"
                 // We have also to merge all dictionaries of the old and the new "Pages" object
                 if let Ok(dictionary) = object.as_dict() {
@@ -81,9 +81,9 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     ));
                 }
             }
-            "Page" => {} // Ignored, processed later and separately
-            "Outlines" => {} // Ignored, not supported yet
-            "Outline" => {} // Ignored, not supported yet
+            b"Page" => {} // Ignored, processed later and separately
+            b"Outlines" => {} // Ignored, not supported yet
+            b"Outline" => {} // Ignored, not supported yet
             _ => {
                 merged_doc.max_id += 1;
                 merged_doc.objects.insert(*object_id, object.clone());
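These hunks are mechanical fallout from the lopdf upgrade: as of the 0.35 line, Object::type_name() yields the type name as bytes rather than &str, so the match arms switch to byte-string literals. The same comparison in isolation (helper name hypothetical):

    fn is_catalog(object: &lopdf::Object) -> bool {
        // type_name() now returns bytes, so compare against a byte-string literal.
        object.type_name().map(|name| name == b"Catalog").unwrap_or(false)
    }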
@@ -1,4 +1,4 @@
-# thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
+# thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
 {
   description = "little-hesinde project";
   inputs = {
@@ -21,7 +21,7 @@
       fenixPkgs:
       fenixPkgs.fromToolchainFile {
         file = ./rust-toolchain.toml;
-        sha256 = "sha256-VZZnlyP69+Y3crrLHQyJirqlHrTtGTsyiSnZB8jEvVo=";
+        sha256 = "sha256-AJ6LX/Q/Er9kS15bn9iflkUwcgYqRQxiOIL2ToVAXaU=";
       };
 
       buildTargets = {
@@ -1,4 +1,4 @@
 [toolchain]
 channel = "stable"
 targets = [ "i686-unknown-linux-musl", "aarch64-unknown-linux-musl", "x86_64-unknown-linux-musl" ]
-profile = "minimal"
+profile = "default"
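Switching the toolchain profile from "minimal" to "default" pulls in rustup's extra default components (per rustup's profile definitions, rustfmt, clippy, and rust-docs on top of rustc, cargo, and rust-std). That is also why the fenix sha256 in the flake hunk above had to change: the pinned toolchain closure is different.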