run downloads in parallel

2024-07-03 13:43:08 +02:00 · 2024-07-03 13:43:08 +02:00 · f7c1408cef
commit f7c1408cef
parent cf8fbe0965
3 changed files with 88 additions and 35 deletions
--- a/README.md
+++ b/README.md
@ -74,6 +74,12 @@ Login and use the resulting cookie to download all issues from 2024-06-01 until
 nzz-cookie -u 'myuser@example.com' <pw | nzz-download -f 2024-06-01 -t 2024-06-05
 ```

+## Caveats
+
+There are no retries on a failed download so far, it just crashes. Stemming from
+that I would advise not to try and download big ranges at once until that is
+fixed.
+
 ## License

 Licensed as [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.html).
--- a/download/src/download.rs
+++ b/download/src/download.rs
@ -1,16 +1,54 @@
 //! Handle downloads of newspaper issues.

 use std::{
-    fs::{self},
+    fs,
    io::{Cursor, Read},
-    path::Path,
+    path::{Path, PathBuf},
+    sync::Arc,
 };

 use anyhow::Result;
+use tokio::{spawn, sync::Semaphore, task::JoinSet};
 use tracing::{debug, info};

 use crate::{nzz::Issue, pdf};

+const MAX_DOWNLOADS: usize = 4;
+
+/// Fetch a single newspaper issue and save the merged pdf to `output_dir`.
+async fn fetch_issue(issue: &Issue, output_dir: PathBuf) -> Result<()> {
+    info!("saving issue {}", issue.publication_date);
+
+    let client = reqwest::Client::new();
+    let tmp_dir = tempfile::tempdir()?;
+    let mut pages = Vec::new();
+    for (i, page) in issue.pages.iter().enumerate() {
+        debug!(
+            "fetching issue {}, page {}: {page}",
+            issue.publication_date,
+            i + 1
+        );
+
+        let response = client.get(page).send().await?;
+        let mut content = Cursor::new(response.bytes().await?);
+        let mut page_data = Vec::new();
+        content.read_to_end(&mut page_data)?;
+
+        let tmp_page = tmp_dir.path().join(i.to_string());
+        fs::write(&tmp_page, page_data)?;
+        pages.push(tmp_page);
+    }
+
+    let issue_name = format!("nzz_{}.pdf", issue.publication_date);
+    let issue_path = output_dir.join(issue_name);
+    let issue_title = format!("NZZ {}", issue.publication_date);
+
+    pdf::merge(pages, &issue_path, &issue_title)?;
+    debug!("issue {} saved", issue.publication_date);
+
+    Ok(())
+}
+
 /// Download all pages of the provided `issues` and save them merged to the directory `output_dir`.
 ///
 /// Create `output_dir` if it does not exist.
@ -18,34 +56,23 @@ pub async fn fetch(issues: Vec<Issue>, output_dir: &Path) -> Result<()> {
    debug!("ensuring {output_dir:?} exists");
    fs::create_dir_all(output_dir)?;

+    let permits = Arc::new(Semaphore::new(MAX_DOWNLOADS));
+
+    let mut jobs = JoinSet::new();
    for issue in issues {
-        info!("saving issue {}", issue.publication_date);
+        let permits = permits.clone();
+        let output_dir = output_dir.to_path_buf().clone();

-        let tmp_dir = tempfile::tempdir()?;
-        let mut pages = Vec::new();
-        for (i, page) in issue.pages.into_iter().enumerate() {
-            debug!(
-                "fetching issue {}, page {}: {page}",
-                issue.publication_date,
-                i + 1
-            );
+        let job: tokio::task::JoinHandle<Result<()>> = spawn(async move {
+            let _permit = permits.acquire().await.unwrap();
+            fetch_issue(&issue, output_dir).await?;
+            Ok(())
+        });
+        jobs.spawn(job);
+    }

-            let response = reqwest::Client::new().get(page).send().await?;
-            let mut content = Cursor::new(response.bytes().await?);
-            let mut page_data = Vec::new();
-            content.read_to_end(&mut page_data)?;
-
-            let tmp_page = tmp_dir.path().join(i.to_string());
-            fs::write(&tmp_page, page_data)?;
-            pages.push(tmp_page);
-        }
-
-        let issue_name = format!("nzz_{}.pdf", issue.publication_date);
-        let issue_path = output_dir.join(issue_name);
-        let issue_title = format!("NZZ {}", issue.publication_date);
-
-        pdf::merge(pages, &issue_path, &issue_title)?;
-        debug!("issue {} saved", issue.publication_date);
+    while let Some(res) = jobs.join_next().await {
+        res???;
    }

    Ok(())
--- a/download/src/nzz.rs
+++ b/download/src/nzz.rs
@ -1,12 +1,16 @@
 //! Handle information relating to NZZ issues.

+use std::sync::Arc;
+
 use anyhow::Result;
 use serde::{Deserialize, Serialize};
 use time::Date;
+use tokio::{spawn, sync::Semaphore, task::JoinSet};
 use tracing::info;

 const SEARCH_URL: &str = "https://zeitungsarchiv.nzz.ch/solr-epaper-search/1.0/search";
 const ISSUE_URL: &str = "https://zeitungsarchiv.nzz.ch/archive/1.0/getPages";
+const MAX_DOWNLOADS: usize = 4;

 #[derive(Debug, Serialize, Deserialize)]
 struct SearchData {
@ -173,16 +177,32 @@ async fn build_pages(cookie: &str, edition_id: u32) -> Result<Vec<String>> {
 /// Fetch all page urls for `issues`.
 async fn build_issues(cookie: &str, issues: Vec<IssueData>) -> Result<Vec<Issue>> {
    let mut hydrated_issues = Vec::new();
+    let permits = Arc::new(Semaphore::new(MAX_DOWNLOADS));
+
+    let mut jobs = JoinSet::new();
    for issue in issues {
-        info!(
-            "fetching page information for issue {}",
-            issue.publication_date
-        );
-        let pages = build_pages(cookie, issue.edition_id).await?;
-        hydrated_issues.push(Issue {
-            publication_date: issue.publication_date,
-            pages,
+        let permits = permits.clone();
+        let cookie = cookie.to_string();
+
+        let job: tokio::task::JoinHandle<Result<Issue>> = spawn(async move {
+            let _permit = permits.acquire().await.unwrap();
+
+            info!(
+                "fetching page information for issue {}",
+                issue.publication_date
+            );
+            let pages = build_pages(&cookie, issue.edition_id).await?;
+            Ok(Issue {
+                publication_date: issue.publication_date,
+                pages,
+            })
        });
+        jobs.spawn(job);
+    }
+
+    while let Some(res) = jobs.join_next().await {
+        let issue: Issue = res???;
+        hydrated_issues.push(issue);
    }

    Ok(hydrated_issues)