From 79250a53d3cd90fadccfb52da512e2e1c918c1ca Mon Sep 17 00:00:00 2001
From: Sebastian Hugentobler <sebastian@vanwa.ch>
Date: Sat, 18 Nov 2023 17:26:06 +0100
Subject: [PATCH] scrape image links as well (they do not need auth)

---
 src/feeds/mod.rs | 13 +++++++++
 src/scrapers.rs  | 68 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 62 insertions(+), 19 deletions(-)
diff --git a/src/feeds/mod.rs b/src/feeds/mod.rs
index d58f0f6..0625139 100644
--- a/src/feeds/mod.rs
+++ b/src/feeds/mod.rs
@@ -34,6 +34,7 @@ pub struct Footprint {
     pub url: String,
     pub date: Date,
     pub page: u8,
+    pub images: Vec<String>,
 }
 
 const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
@@ -54,11 +55,23 @@ impl Footprint {
             page_url(root_url, self.page),
             self.page
         );
+        // NOTE: HTML injection risk again: image URLs are interpolated into the markup unescaped
+        let content = self
+            .images
+            .iter()
+            .map(|x| format!("<img src=\"{}\" /><br />", x))
+            .fold(String::new(), |mut a, b| {
+                a.reserve(b.len());
+                a.push_str(&b);
+                a
+            });
+
         ItemBuilder::default()
             .title(Some(self.title))
             .pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok())
             .link(Some(self.url.clone()))
             .description(Some(desc))
+            .content(Some(content))
             .guid(Some(GuidBuilder::default().value(self.url).build()))
             .build()
     }
diff --git a/src/scrapers.rs b/src/scrapers.rs
index ed1d8db..32b6a02 100644
--- a/src/scrapers.rs
+++ b/src/scrapers.rs
@@ -28,8 +28,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeEr
     let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
     let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
     let more_selector = Selector::parse("a#footprintListLoadMore")?;
-    let text_selector = Selector::parse("div.text > p")?;
-    let text_rest_selector = Selector::parse("div.text > p > span.rest")?;
 
     let mut footprints: Vec<Footprint> = Vec::new();
     let mut has_more_pages = true;
@@ -44,8 +42,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeEr
                 &x,
                 &footprint_title_selector,
                 &footprint_date_selector,
-                &text_selector,
-                &text_rest_selector,
                 page,
             )
         }));
@@ -61,26 +57,20 @@ fn parse_footprint(
     footprint_el: &ElementRef,
     footprint_title_selector: &Selector,
     footprint_date_selector: &Selector,
-    text_selector: &Selector,
-    text_rest_selector: &Selector,
     page: u8,
 ) -> Option<Footprint> {
     let title_el = footprint_el.select(footprint_title_selector).next()?;
-    let title = title_el.text().next()?.to_string();
-    let url = title_el.value().attr("href")?.to_string();
-    let date = footprint_el
-        .select(footprint_date_selector)
-        .next()?
-        .value()
-        .attr("content")?;
+    let title = single_text_from_el(&title_el)?;
+    let url = attr_from_el(&title_el, "href")?;
+    let date_el = footprint_el.select(footprint_date_selector).next()?;
+    let date = attr_from_el(&date_el, "content")?;
     let format = format_description::parse("[year]-[month]-[day]").ok()?;
-    let date = Date::parse(date, &format).ok()?;
+    let date = Date::parse(&date, &format).ok()?;
 
-    let text = if let Some(text) = footprint_el.select(text_selector).next() {
-        let text = text.text().next()?.to_string();
-
-        if let Some(text_rest) = footprint_el.select(text_rest_selector).next() {
-            format!("{}{}", text, text_rest.text().next()?)
+    let text = if let Some(text) = single_text_from_selector(footprint_el, "div.text > p") {
+        if let Some(text_rest) = single_text_from_selector(footprint_el, "div.text > p > span.rest")
+        {
+            format!("{}{}", text, text_rest)
         } else {
             text
         }
@@ -90,15 +80,55 @@ fn parse_footprint(
     .trim()
     .to_string();
 
+    let images = scrape_img_links(&footprint_el).ok()?;
+
     Some(Footprint {
         title,
         url,
         date,
         text,
         page,
+        images,
     })
 }
 
+/// Text of the first `selector` match within `el`; `None` on a bad selector or no match.
+fn single_text_from_selector(el: &ElementRef, selector: &str) -> Option<String> {
+    let parsed = Selector::parse(selector).ok()?;
+    let first_match = el.select(&parsed).next()?;
+    single_text_from_el(&first_match)
+}
+
+fn single_text_from_el(el: &ElementRef) -> Option<String> {
+    el.text().next().map(str::to_owned) // first text node only; `None` if there is none
+}
+
+fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
+    el.value().attr(attribute).map(str::to_owned) // `None` when the attribute is absent
+}
+
+/// Gathers the `data-url` of the main image (if any) followed by those of all thumbnails.
+/// Fails only if one of the hard-coded CSS selectors cannot be parsed.
+fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> {
+    let main_img_selector = Selector::parse("div.images-container > a.image")?;
+    let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?;
+
+    // Only the first match of the main selector is considered; every thumbnail is.
+    let main_img = footprint_el.select(&main_img_selector).next();
+    let thumbs = footprint_el.select(&other_img_selector);
+
+    let img_urls = main_img
+        .into_iter()
+        .chain(thumbs)
+        // Anchors lacking a `data-url` attribute are silently skipped.
+        .filter_map(|img| attr_from_el(&img, "data-url"))
+        // NOTE(review): assumes `data-url` is protocol-relative (`//host/...`) — confirm.
+        .map(|url| format!("https:{}", url))
+        .collect();
+
+    Ok(img_urls)
+}
+
 pub fn page_url(feed_url: &str, page: u8) -> String {
     let connector = if feed_url.contains('?') { "&" } else { "?" };
     format!("{}{}page={}&sort=ASC", feed_url, connector, page)