support different issues on the same day
parent 7eea06d59b
commit 1e594d53ea
Cargo.lock (generated): 706 lines changed; diff suppressed because it is too large.
@@ -5,10 +5,10 @@ members = [
 ]
 
 [workspace.dependencies]
-anyhow = "1.0.86"
-clap = { version = "4.5.8", features = ["env", "derive"] }
+anyhow = "1.0.96"
+clap = { version = "4.5.30", features = ["env", "derive"] }
 serde_json = "1.0.120"
-tokio = { version = "1", features = ["full"] }
+tokio = { version = "1.43.0", features = ["full"] }
 
 [workspace.package]
 license = "AGPL-3.0"
|
@@ -1,6 +1,6 @@
 [package]
 name = "nzz-cookie"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,7 +9,7 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-fantoccini = "0.19.3"
+fantoccini = "0.21.4"
 serde_json = { workspace = true }
 tokio = { workspace = true }
 
@@ -1,6 +1,6 @@
 [package]
 name = "nzz-download"
-version = "0.2.1"
+version = "0.3.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,12 +9,12 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-lopdf = "0.32.0"
-reqwest = { version = "0.12.5", features = ["json"] }
-serde = { version = "1.0.203", features = ["derive"] }
+lopdf = "0.35.0"
+reqwest = { version = "0.12.12", features = ["json"] }
+serde = { version = "1.0.218", features = ["derive"] }
 serde_json = { workspace = true }
-tempfile = "3.10.1"
+tempfile = "3.17.1"
 tokio = { workspace = true }
-time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
-tracing = "0.1.40"
-tracing-subscriber = "0.3.18"
+time = { version = "0.3.37", features = ["macros", "serde", "formatting", "parsing" ] }
+tracing = "0.1.41"
+tracing-subscriber = "0.3.19"
@@ -17,7 +17,10 @@ const MAX_DOWNLOADS: usize = 4;
 
 /// Fetch a single newspaper issue and save the merged pdf to `output_dir`.
 async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
-    info!("saving issue {}", issue.publication_date);
+    info!(
+        "saving issue {} ({})",
+        issue.publication_date, issue.edition
+    );
 
     let client = reqwest::Client::new();
     let tmp_dir = tempfile::tempdir()?;
@@ -39,9 +42,13 @@ async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
         pages.push(tmp_page);
     }
 
-    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
+    let issue_name = format!(
+        "nzz_{}_{}.pdf",
+        issue.publication_date,
+        issue.edition.to_lowercase()
+    );
     let issue_path = output_dir.join(issue_name);
-    let issue_title = format!("NZZ {}", issue.publication_date);
+    let issue_title = format!("NZZ {} ({})", issue.publication_date, issue.edition);
 
     pdf::merge(pages, &issue_path, &issue_title)?;
     debug!("issue {} saved", issue.publication_date);
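For illustration, the new naming scheme as a self-contained sketch; the date string is a hypothetical stand-in, since the real value comes from the crate's `time::Date` formatting:

// Sketch only: `publication_date` is a pre-formatted stand-in for the
// crate's `time::Date` value.
fn main() {
    let publication_date = "1946-05-01";
    let edition = "01 (Morgenausgabe)";
    let issue_name = format!("nzz_{}_{}.pdf", publication_date, edition.to_lowercase());
    assert_eq!(issue_name, "nzz_1946-05-01_01 (morgenausgabe).pdf");
}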
@@ -22,7 +22,7 @@ struct SortOption {
 struct SearchData {
     query: String,
     offset: u32,
     #[serde(
         rename = "startDate",
         serialize_with = "crate::date::serialize",
         deserialize_with = "crate::date::deserialize"
@@ -35,7 +35,7 @@ struct SearchData {
     )]
     end_date: Date,
     #[serde(rename = "sortOptions")]
-    sort_options: Vec<SortOption>
+    sort_options: Vec<SortOption>,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
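A minimal sketch of what the `sortOptions` rename produces on the wire, using hypothetical stand-ins (`SearchStub`, `SortStub`) for the crate's own structs:

use serde::Serialize;

// Hypothetical stand-ins, reduced to the renamed field.
#[derive(Serialize)]
struct SortStub {
    field: String,
    order: String,
}

#[derive(Serialize)]
struct SearchStub {
    #[serde(rename = "sortOptions")]
    sort_options: Vec<SortStub>,
}

fn main() -> Result<(), serde_json::Error> {
    let data = SearchStub {
        sort_options: vec![SortStub {
            field: "media_ts".into(),
            order: "desc".into(),
        }],
    };
    // The rename attribute yields the API's camelCase key.
    assert_eq!(
        serde_json::to_string(&data)?,
        r#"{"sortOptions":[{"field":"media_ts","order":"desc"}]}"#
    );
    Ok(())
}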
@@ -64,6 +64,7 @@ struct IssueData {
         deserialize_with = "crate::date::deserialize"
     )]
     publication_date: Date,
+    issue: String,
 }
 
 #[derive(Debug, Serialize, Deserialize)]
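The new `issue` field carries no `#[serde(rename)]`, so it maps straight to the JSON key "issue". A minimal sketch with a hypothetical `IssueStub` stand-in for `IssueData`:

use serde::Deserialize;

// Hypothetical stand-in for IssueData, reduced to the new field.
#[derive(Debug, Deserialize)]
struct IssueStub {
    issue: String,
}

fn main() -> Result<(), serde_json::Error> {
    let stub: IssueStub = serde_json::from_str(r#"{"issue": "Zürich"}"#)?;
    assert_eq!(stub.issue, "Zürich");
    Ok(())
}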
@@ -100,6 +101,8 @@ struct PageHighRes {
 pub struct Issue {
     /// Date of publication.
     pub publication_date: Date,
+    /// Edition name (used to have multiple issues on the same date).
+    pub edition: String,
     /// ordered vector of page urls in the issue.
     pub pages: Vec<String>,
 }
@@ -109,7 +112,10 @@ impl SearchData {
         Self {
             query: "".to_string(),
             offset,
-            sort_options: vec![SortOption { field: "media_ts".to_string(), order: "desc".to_string() }],
+            sort_options: vec![SortOption {
+                field: "media_ts".to_string(),
+                order: "desc".to_string(),
+            }],
             start_date,
             end_date,
         }
@@ -177,6 +183,16 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
     Ok(pages)
 }
 
+/// Convert the various issue names to their descriptive equivalent.
+fn convert_issue_name(issue: &str) -> String {
+    match issue {
+        "Zürich" => "01 (Morgenausgabe)".into(),
+        "Ausgabe 02" => "02 (Mittagausgabe)".into(),
+        "Ausgabe 03" => "03 (Abendausgabe)".into(),
+        _ => unreachable!(),
+    }
+}
+
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
     let mut hydrated_issues = Vec::new();
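A test sketch (not part of the commit) that pins down the mapping; note that `convert_issue_name` panics via `unreachable!()` on any name outside these three:

#[cfg(test)]
mod tests {
    use super::convert_issue_name;

    #[test]
    fn maps_the_three_known_issue_names() {
        assert_eq!(convert_issue_name("Zürich"), "01 (Morgenausgabe)");
        assert_eq!(convert_issue_name("Ausgabe 02"), "02 (Mittagausgabe)");
        assert_eq!(convert_issue_name("Ausgabe 03"), "03 (Abendausgabe)");
    }
}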
@@ -197,6 +213,7 @@ async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>
         let pages = build_pages(&cookie, issue.edition_id).await?;
         Ok(Issue {
             publication_date: issue.publication_date,
+            edition: convert_issue_name(&issue.issue),
             pages,
         })
     });
@@ -48,8 +48,8 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
     for (object_id, object) in documents_objects.iter() {
         // We have to ignore "Page" (as are processed later), "Outlines" and "Outline" objects
         // All other objects should be collected and inserted into the main Document
-        match object.type_name().unwrap_or("") {
-            "Catalog" => {
+        match object.type_name().unwrap_or(b"") {
+            b"Catalog" => {
                 // Collect a first "Catalog" object and use it for the future "Pages"
                 catalog_object = Some((
                     if let Some((id, _)) = catalog_object {
@@ -60,7 +60,7 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     object.clone(),
                 ));
             }
-            "Pages" => {
+            b"Pages" => {
                 // Collect and update a first "Pages" object and use it for the future "Catalog"
                 // We have also to merge all dictionaries of the old and the new "Pages" object
                 if let Ok(dictionary) = object.as_dict() {
@@ -81,9 +81,9 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     ));
                 }
             }
-            "Page" => {} // Ignored, processed later and separately
-            "Outlines" => {} // Ignored, not supported yet
-            "Outline" => {} // Ignored, not supported yet
+            b"Page" => {} // Ignored, processed later and separately
+            b"Outlines" => {} // Ignored, not supported yet
+            b"Outline" => {} // Ignored, not supported yet
             _ => {
                 merged_doc.max_id += 1;
                 merged_doc.objects.insert(*object_id, object.clone());
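The switch from string to byte-string patterns tracks the lopdf bump: in lopdf 0.35, `Object::type_name()` hands back the type name as bytes rather than `&str`, so every match arm gains a `b` prefix. A minimal sketch of the pattern, assuming lopdf 0.35 and a hypothetical `describe` helper:

use lopdf::Object;

// Hypothetical helper: classify a PDF object by its /Type name, matching
// on byte strings as lopdf 0.35's `type_name()` requires.
fn describe(object: &Object) -> &'static str {
    match object.type_name().unwrap_or(b"") {
        b"Catalog" => "document catalog",
        b"Pages" => "page tree node",
        b"Page" => "single page",
        _ => "other object",
    }
}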
@@ -1,4 +1,4 @@
 # thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
 {
   description = "little-hesinde project";
   inputs = {
@@ -21,7 +21,7 @@
       fenixPkgs:
       fenixPkgs.fromToolchainFile {
         file = ./rust-toolchain.toml;
-        sha256 = "sha256-VZZnlyP69+Y3crrLHQyJirqlHrTtGTsyiSnZB8jEvVo=";
+        sha256 = "sha256-AJ6LX/Q/Er9kS15bn9iflkUwcgYqRQxiOIL2ToVAXaU=";
       };
 
       buildTargets = {
@@ -1,4 +1,4 @@
 [toolchain]
 channel = "stable"
 targets = [ "i686-unknown-linux-musl", "aarch64-unknown-linux-musl", "x86_64-unknown-linux-musl" ]
-profile = "minimal"
+profile = "default"