diff --git a/README.md b/README.md
index 198e3ad..ab6411a 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,12 @@ Login and use the resulting cookie to download all issues from 2024-06-01 until
 nzz-cookie -u 'myuser@example.com'
diff --git a/download/src/fetch.rs b/download/src/fetch.rs
--- a/download/src/fetch.rs
+++ b/download/src/fetch.rs
@@ -1,14 +1,52 @@
 use std::fs;
 use std::io::{Cursor, Read};
-use std::path::Path;
+use std::path::{Path, PathBuf};
+use std::sync::Arc;
 
 use anyhow::Result;
+use tokio::{spawn, sync::Semaphore, task::JoinSet};
 use tracing::{debug, info};
 
 use crate::nzz::Issue;
 use crate::pdf;
 
+const MAX_DOWNLOADS: usize = 4;
+
+async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
+    info!("saving issue {}", issue.publication_date);
+
+    let client = reqwest::Client::new();
+    let tmp_dir = tempfile::tempdir()?;
+    let mut pages = Vec::new();
+    for (i, page) in issue.pages.iter().enumerate() {
+        debug!(
+            "fetching issue {}, page {}: {page}",
+            issue.publication_date,
+            i + 1
+        );
+
+        let response = client.get(page).send().await?;
+        let mut content = Cursor::new(response.bytes().await?);
+        let mut page_data = Vec::new();
+        content.read_to_end(&mut page_data)?;
+
+        let tmp_page = tmp_dir.path().join(i.to_string());
+        fs::write(&tmp_page, page_data)?;
+        pages.push(tmp_page);
+    }
+
+    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
+    let issue_path = output_dir.join(issue_name);
+    let issue_title = format!("NZZ {}", issue.publication_date);
+
+    pdf::merge(pages, &issue_path, &issue_title)?;
+    debug!("issue {} saved", issue.publication_date);
+
+    Ok(())
+}
+
 /// Download all pages of the provided `issues` and save them merged to the directory `output_dir`.
 ///
 /// Create `output_dir` if it does not exist.
@@ -18,34 +56,23 @@ pub async fn fetch(issues: Vec<Issue>, output_dir: &Path) -> Result<()> {
     debug!("ensuring {output_dir:?} exists");
     fs::create_dir_all(output_dir)?;
 
+    let permits = Arc::new(Semaphore::new(MAX_DOWNLOADS));
+
+    let mut jobs = JoinSet::new();
     for issue in issues {
-        info!("saving issue {}", issue.publication_date);
+        let permits = permits.clone();
+        let output_dir = output_dir.to_path_buf().clone();
 
-        let tmp_dir = tempfile::tempdir()?;
-        let mut pages = Vec::new();
-        for (i, page) in issue.pages.into_iter().enumerate() {
-            debug!(
-                "fetching issue {}, page {}: {page}",
-                issue.publication_date,
-                i + 1
-            );
+        let job: tokio::task::JoinHandle<Result<()>> = spawn(async move {
+            let _permit = permits.acquire().await.unwrap();
+            fetch_issue(&issue, output_dir).await?;
+            Ok(())
+        });
+        jobs.spawn(job);
+    }
 
-            let response = reqwest::Client::new().get(page).send().await?;
-            let mut content = Cursor::new(response.bytes().await?);
-            let mut page_data = Vec::new();
-            content.read_to_end(&mut page_data)?;
-
-            let tmp_page = tmp_dir.path().join(i.to_string());
-            fs::write(&tmp_page, page_data)?;
-            pages.push(tmp_page);
-        }
-
-        let issue_name = format!("nzz_{}.pdf", issue.publication_date);
-        let issue_path = output_dir.join(issue_name);
-        let issue_title = format!("NZZ {}", issue.publication_date);
-
-        pdf::merge(pages, &issue_path, &issue_title)?;
-        debug!("issue {} saved", issue.publication_date);
+    while let Some(res) = jobs.join_next().await {
+        res???;
     }
 
     Ok(())
diff --git a/download/src/nzz.rs b/download/src/nzz.rs
index 4b1f88a..a2911b4 100644
--- a/download/src/nzz.rs
+++ b/download/src/nzz.rs
@@ -1,12 +1,16 @@
 //! Handle information relating to NZZ issues.
 
+use std::sync::Arc;
+
 use anyhow::Result;
 use serde::{Deserialize, Serialize};
 use time::Date;
+use tokio::{spawn, sync::Semaphore, task::JoinSet};
 use tracing::info;
 
 const SEARCH_URL: &str = "https://zeitungsarchiv.nzz.ch/solr-epaper-search/1.0/search";
 const ISSUE_URL: &str = "https://zeitungsarchiv.nzz.ch/archive/1.0/getPages";
+const MAX_DOWNLOADS: usize = 4;
 
 #[derive(Debug, Serialize, Deserialize)]
 struct SearchData {
@@ -173,16 +177,32 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<SearchIssue>) -> Result<Vec<Issue>> {
     let mut hydrated_issues = Vec::new();
+    let permits = Arc::new(Semaphore::new(MAX_DOWNLOADS));
+
+    let mut jobs = JoinSet::new();
     for issue in issues {
-        info!(
-            "fetching page information for issue {}",
-            issue.publication_date
-        );
-        let pages = build_pages(cookie, issue.edition_id).await?;
-        hydrated_issues.push(Issue {
-            publication_date: issue.publication_date,
-            pages,
+        let permits = permits.clone();
+        let cookie = cookie.to_string();
+
+        let job: tokio::task::JoinHandle<Result<Issue>> = spawn(async move {
+            let _permit = permits.acquire().await.unwrap();
+
+            info!(
+                "fetching page information for issue {}",
+                issue.publication_date
+            );
+            let pages = build_pages(&cookie, issue.edition_id).await?;
+            Ok(Issue {
+                publication_date: issue.publication_date,
+                pages,
+            })
         });
+        jobs.spawn(job);
+    }
+
+    while let Some(res) = jobs.join_next().await {
+        let issue: Issue = res???;
+        hydrated_issues.push(issue);
     }
 
     Ok(hydrated_issues)
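
Both hunks apply the same bounded-concurrency idiom: an `Arc<Semaphore>` with `MAX_DOWNLOADS` permits caps how many tasks do work at once, while a `JoinSet` collects the spawned tasks and surfaces their results as they complete. A minimal, self-contained sketch of that idiom, assuming `tokio` (with the `sync`, `time`, `rt-multi-thread`, and `macros` features) and `anyhow`; the job body and the count of 16 are illustrative stand-ins, not code from this patch:

```rust
use std::sync::Arc;
use std::time::Duration;

use anyhow::Result;
use tokio::{sync::Semaphore, task::JoinSet};

const MAX_DOWNLOADS: usize = 4;

#[tokio::main]
async fn main() -> Result<()> {
    // One permit per allowed concurrent download; tasks past the limit
    // park in acquire() until an earlier permit is dropped.
    let permits = Arc::new(Semaphore::new(MAX_DOWNLOADS));
    let mut jobs = JoinSet::new();

    for i in 0..16 {
        let permits = permits.clone();
        jobs.spawn(async move {
            // The named `_permit` binding keeps the guard alive until the
            // end of the task, releasing the permit only when work is done.
            let _permit = permits.acquire().await?;
            // Stand-in for the real work (e.g. fetching one issue's pages).
            tokio::time::sleep(Duration::from_millis(100)).await;
            println!("job {i} finished");
            Ok::<_, anyhow::Error>(())
        });
    }

    // First `?` surfaces panics/cancellation (JoinError), second the
    // task's own anyhow::Error.
    while let Some(res) = jobs.join_next().await {
        res??;
    }

    Ok(())
}
```

The sketch spawns the future on the `JoinSet` directly, whereas the patch first creates a `JoinHandle` with `spawn` and then puts that handle on the set; the extra handle adds one more `Result` layer, which is why the patch joins with `res???` where the sketch needs only `res??`.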