Compare commits


No commits in common. "main" and "direct-url" have entirely different histories.

13 changed files with 348 additions and 467 deletions

.envrc (1 line changed)

@@ -1 +0,0 @@
-use flake

.gitignore (1 line changed)

@@ -5,4 +5,3 @@ node_modules
 /target
 nzz/
 result
-.direnv/

Cargo.lock (704 lines changed)

File diff suppressed because it is too large.

Cargo.toml

@@ -5,10 +5,10 @@ members = [
 ]

 [workspace.dependencies]
-anyhow = "1.0.96"
-clap = { version = "4.5.30", features = ["env", "derive"] }
+anyhow = "1.0.86"
+clap = { version = "4.5.8", features = ["env", "derive"] }
 serde_json = "1.0.120"
-tokio = { version = "1.43.0", features = ["full"] }
+tokio = { version = "1", features = ["full"] }

 [workspace.package]
 license = "AGPL-3.0"

README.md

@@ -76,9 +76,9 @@ nzz-cookie -u 'myuser@example.com' <pw | nzz-download -f 2024-06-01 -t 2024-06-0
 ## Caveats

-There are no retries on a failed downloads so far, it just crashes. Stemming
-from that I would advise not to try and download big ranges at once until that
-is fixed.
+There are no retries on a failed download so far, it just crashes. Stemming from
+that I would advise not to try and download big ranges at once until that is
+fixed.

 ## License
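As context for that caveat: a minimal sketch of the retry behaviour the README says is missing. The `with_retries` helper and its backoff values are hypothetical, not code from either branch.

use std::time::Duration;

// Hypothetical retry wrapper: run a fallible async operation up to
// `attempts` times, sleeping a little longer after each failure, and
// return the last error if every attempt fails. Panics if `attempts` is 0.
async fn with_retries<F, Fut, T>(mut op: F, attempts: u32) -> anyhow::Result<T>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<T>>,
{
    let mut last_err = None;
    for n in 0..attempts {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) => {
                last_err = Some(e);
                if n + 1 < attempts {
                    tokio::time::sleep(Duration::from_millis(500 * (u64::from(n) + 1))).await;
                }
            }
        }
    }
    Err(last_err.expect("attempts must be > 0"))
}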

nzz-cookie/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "nzz-cookie"
-version = "0.3.0"
+version = "0.1.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,7 +9,7 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-fantoccini = "0.21.4"
+fantoccini = "0.19.3"
 serde_json = { workspace = true }
 tokio = { workspace = true }

nzz-cookie source file (login flow; path not shown)

@@ -30,20 +30,6 @@ pub async fn run(args: Config, pw: &str) -> Result<()> {
         .await?;
     client.goto(LOGIN_URL).await?;
     sleep(Duration::from_millis(500)).await;

-    // Sometimes the cookie notice has to be accepted, not sure when though
-    if client.find(Locator::Css(".cmpboxbtncustom")).await.is_ok() {
-        let fu_open_button: Element = element_from_css(&client, ".cmpboxbtncustom").await?;
-        fu_open_button.click().await?;
-        let fu_button: Element = element_from_css(&client, ".cmpboxbtn.cmpboxbtnyes.cmpboxbtnyescustomchoices.cmptxt_btn_save").await?;
-        sleep(Duration::from_millis(500)).await;
-        fu_button.click().await?;
-    }
-
     let login_button: Element = element_from_css(&client, ".fup-menu-login-container").await?;
     sleep(Duration::from_millis(500)).await;
     login_button.click().await?;
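The removed block relies on fantoccini's `Client::find` returning an `Err` when a selector matches nothing, so `.is_ok()` doubles as a presence check for the sometimes-shown consent banner. A self-contained sketch of that probe-then-click pattern; the target URL and a WebDriver server on localhost:4444 are assumptions, not part of the project:

use fantoccini::{ClientBuilder, Locator};

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Assumes a WebDriver server (e.g. geckodriver) listening on port 4444.
    let client = ClientBuilder::native()
        .connect("http://localhost:4444")
        .await?;
    client.goto("https://example.com").await?;

    // find() fails when nothing matches, so Ok(..) means the element exists.
    if let Ok(button) = client.find(Locator::Css(".cmpboxbtncustom")).await {
        button.click().await?;
    }

    client.close().await?;
    Ok(())
}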

nzz-download/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "nzz-download"
-version = "0.3.0"
+version = "0.1.0"
 edition = "2021"
 license = { workspace = true }
 authors = { workspace = true }
@@ -9,12 +9,12 @@ repository = { workspace = true }
 [dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
-lopdf = "0.35.0"
-reqwest = { version = "0.12.12", features = ["json"] }
-serde = { version = "1.0.218", features = ["derive"] }
+lopdf = "0.32.0"
+reqwest = { version = "0.12.5", features = ["json"] }
+serde = { version = "1.0.203", features = ["derive"] }
 serde_json = { workspace = true }
-tempfile = "3.17.1"
+tempfile = "3.10.1"
 tokio = { workspace = true }
-time = { version = "0.3.37", features = ["macros", "serde", "formatting", "parsing" ] }
-tracing = "0.1.41"
-tracing-subscriber = "0.3.19"
+time = { version = "0.3.36", features = ["macros", "serde", "formatting", "parsing" ] }
+tracing = "0.1.40"
+tracing-subscriber = "0.3.18"

nzz-download source file (issue download; path not shown)

@@ -17,10 +17,7 @@ const MAX_DOWNLOADS: usize = 4;
 /// Fetch a single newspaper issue and save the merged pdf to `output_dir`.
 async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
-    info!(
-        "saving issue {} ({})",
-        issue.publication_date, issue.edition
-    );
+    info!("saving issue {}", issue.publication_date);

     let client = reqwest::Client::new();
     let tmp_dir = tempfile::tempdir()?;
@@ -42,13 +39,9 @@ async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
         pages.push(tmp_page);
     }

-    let issue_name = format!(
-        "nzz_{}_{}.pdf",
-        issue.publication_date,
-        issue.edition.to_lowercase()
-    );
+    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
     let issue_path = output_dir.join(issue_name);
-    let issue_title = format!("NZZ {} ({})", issue.publication_date, issue.edition);
+    let issue_title = format!("NZZ {}", issue.publication_date);
     pdf::merge(pages, &issue_path, &issue_title)?;

     debug!("issue {} saved", issue.publication_date);
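Sketched with hypothetical values, the resulting file names differ like this (`publication_date` is a `time::Date` in the real code, reduced to a plain string here):

fn main() {
    let publication_date = "1946-05-25";
    let edition = "01 (Morgenausgabe)";

    // main: edition folded into the file name and title
    let with_edition = format!("nzz_{}_{}.pdf", publication_date, edition.to_lowercase());
    assert_eq!(with_edition, "nzz_1946-05-25_01 (morgenausgabe).pdf");

    // direct-url: date only
    let date_only = format!("nzz_{}.pdf", publication_date);
    assert_eq!(date_only, "nzz_1946-05-25.pdf");
}

Dropping the edition means two editions published on the same date would map to the same file name, which is what the removed `edition` field existed to avoid.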

nzz-download source file (archive search API; path not shown)

@@ -12,16 +12,14 @@ const SEARCH_URL: &str = "https://zeitungsarchiv.nzz.ch/solr-epaper-search/1.0/s
 const ISSUE_URL: &str = "https://zeitungsarchiv.nzz.ch/archive/1.0/getPages";
 const MAX_DOWNLOADS: usize = 4;

-#[derive(Debug, Serialize, Deserialize)]
-struct SortOption {
-    field: String,
-    order: String,
-}
-
 #[derive(Debug, Serialize, Deserialize)]
 struct SearchData {
     query: String,
     offset: u32,
+    #[serde(rename = "sortField")]
+    sort_field: String,
+    #[serde(rename = "sortOrder")]
+    sort_order: String,
     #[serde(
         rename = "startDate",
         serialize_with = "crate::date::serialize",
@@ -34,8 +32,6 @@ struct SearchData {
         deserialize_with = "crate::date::deserialize"
     )]
     end_date: Date,
-    #[serde(rename = "sortOptions")]
-    sort_options: Vec<SortOption>,
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -64,7 +60,6 @@ struct IssueData {
         deserialize_with = "crate::date::deserialize"
     )]
     publication_date: Date,
-    issue: String,
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -101,8 +96,6 @@ struct PageHighRes {
 pub struct Issue {
     /// Date of publication.
     pub publication_date: Date,
-    /// Edition name (used to have multiple issues on the same date).
-    pub edition: String,
     /// ordered vector of page urls in the issue.
     pub pages: Vec<String>,
 }
@@ -112,10 +105,8 @@ impl SearchData {
         Self {
             query: "".to_string(),
             offset,
-            sort_options: vec![SortOption {
-                field: "media_ts".to_string(),
-                order: "desc".to_string(),
-            }],
+            sort_field: "media_ts".to_string(),
+            sort_order: "desc".to_string(),
             start_date,
             end_date,
         }
@@ -183,16 +174,6 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
     Ok(pages)
 }

-/// Convert the various issue names to their descriptive equivalent.
-fn convert_issue_name(issue: &str) -> String {
-    match issue {
-        "Zürich" => "01 (Morgenausgabe)".into(),
-        "Ausgabe 02" => "02 (Mittagausgabe)".into(),
-        "Ausgabe 03" => "03 (Abendausgabe)".into(),
-        _ => unreachable!(),
-    }
-}
-
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
     let mut hydrated_issues = Vec::new();
@@ -213,7 +194,6 @@ async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>
         let pages = build_pages(&cookie, issue.edition_id).await?;

         Ok(Issue {
             publication_date: issue.publication_date,
-            edition: convert_issue_name(&issue.issue),
             pages,
         })
     });
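The net effect on the search request body: main nests the sort under `sortOptions`, while direct-url sends flat `sortField`/`sortOrder` keys. A reduced sketch, with the `Date` fields simplified to plain strings and the structs trimmed to their serialization-relevant parts:

use serde::Serialize;

#[derive(Serialize)]
struct SortOption { field: String, order: String }

// main: nested sort options
#[derive(Serialize)]
struct SearchDataMain {
    query: String,
    offset: u32,
    #[serde(rename = "startDate")] start_date: String,
    #[serde(rename = "endDate")] end_date: String,
    #[serde(rename = "sortOptions")] sort_options: Vec<SortOption>,
}

// direct-url: flat sort fields
#[derive(Serialize)]
struct SearchDataDirectUrl {
    query: String,
    offset: u32,
    #[serde(rename = "sortField")] sort_field: String,
    #[serde(rename = "sortOrder")] sort_order: String,
    #[serde(rename = "startDate")] start_date: String,
    #[serde(rename = "endDate")] end_date: String,
}

fn main() -> serde_json::Result<()> {
    let main_side = SearchDataMain {
        query: "".into(),
        offset: 0,
        start_date: "2024-06-01".into(),
        end_date: "2024-06-02".into(),
        sort_options: vec![SortOption { field: "media_ts".into(), order: "desc".into() }],
    };
    // {"query":"","offset":0,"startDate":"2024-06-01","endDate":"2024-06-02",
    //  "sortOptions":[{"field":"media_ts","order":"desc"}]}
    println!("{}", serde_json::to_string(&main_side)?);

    let direct_url_side = SearchDataDirectUrl {
        query: "".into(),
        offset: 0,
        sort_field: "media_ts".into(),
        sort_order: "desc".into(),
        start_date: "2024-06-01".into(),
        end_date: "2024-06-02".into(),
    };
    // {"query":"","offset":0,"sortField":"media_ts","sortOrder":"desc",
    //  "startDate":"2024-06-01","endDate":"2024-06-02"}
    println!("{}", serde_json::to_string(&direct_url_side)?);
    Ok(())
}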

nzz-download source file (pdf merge; path not shown)

@@ -48,8 +48,8 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
     for (object_id, object) in documents_objects.iter() {
         // We have to ignore "Page" (as they are processed later), "Outlines" and "Outline" objects
         // All other objects should be collected and inserted into the main Document
-        match object.type_name().unwrap_or(b"") {
-            b"Catalog" => {
+        match object.type_name().unwrap_or("") {
+            "Catalog" => {
                 // Collect a first "Catalog" object and use it for the future "Pages"
                 catalog_object = Some((
                     if let Some((id, _)) = catalog_object {
@@ -60,7 +60,7 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     object.clone(),
                 ));
             }
-            b"Pages" => {
+            "Pages" => {
                 // Collect and update a first "Pages" object and use it for the future "Catalog"
                 // We also have to merge all dictionaries of the old and the new "Pages" object
                 if let Ok(dictionary) = object.as_dict() {
@@ -81,9 +81,9 @@ pub fn merge(input: Vec<PathBuf>, out: &Path, title: &str) -> Result<()> {
                     ));
                 }
             }
-            b"Page" => {}     // Ignored, processed later and separately
-            b"Outlines" => {} // Ignored, not supported yet
-            b"Outline" => {}  // Ignored, not supported yet
+            "Page" => {}     // Ignored, processed later and separately
+            "Outlines" => {} // Ignored, not supported yet
+            "Outline" => {}  // Ignored, not supported yet
             _ => {
                 merged_doc.max_id += 1;
                 merged_doc.objects.insert(*object_id, object.clone());
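The b"..." arms are not cosmetic: between lopdf 0.32 (direct-url) and 0.35 (main), `Object::type_name()` switched from returning string slices to byte slices, so the match arms and the `unwrap_or` default change with it. A minimal sketch against lopdf 0.35; `is_page_tree_root` is a hypothetical helper, not code from the diff:

use lopdf::Object;

// Hypothetical helper: classify a PDF object by its /Type name. With
// lopdf 0.35, type_name() yields bytes, hence the byte-string literal;
// on lopdf 0.32 the same match is written with a plain "Pages" literal.
fn is_page_tree_root(object: &Object) -> bool {
    matches!(object.type_name().unwrap_or(b""), b"Pages")
}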

flake.nix

@@ -1,4 +1,4 @@
-# thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
+# thanks to https://code.betamike.com/micropelago/domani for the flake, I still do not completely understand it :)
 {
   description = "little-hesinde project";
   inputs = {
@@ -21,7 +21,7 @@
       fenixPkgs:
       fenixPkgs.fromToolchainFile {
         file = ./rust-toolchain.toml;
-        sha256 = "sha256-AJ6LX/Q/Er9kS15bn9iflkUwcgYqRQxiOIL2ToVAXaU=";
+        sha256 = "sha256-Ngiz76YP4HTY75GGdH2P+APE/DEIx2R/Dn+BwwOyzZU=";
       };
       buildTargets = {

rust-toolchain.toml

@@ -1,4 +1,4 @@
 [toolchain]
 channel = "stable"
 targets = [ "i686-unknown-linux-musl", "aarch64-unknown-linux-musl", "x86_64-unknown-linux-musl" ]
-profile = "default"
+profile = "minimal"