From 79250a53d3cd90fadccfb52da512e2e1c918c1ca Mon Sep 17 00:00:00 2001 From: Sebastian Hugentobler Date: Sat, 18 Nov 2023 17:26:06 +0100 Subject: [PATCH] scrape image links as well (they do not need auth) --- src/feeds/mod.rs | 13 +++++++++ src/scrapers.rs | 68 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 62 insertions(+), 19 deletions(-) diff --git a/src/feeds/mod.rs b/src/feeds/mod.rs index d58f0f6..0625139 100644 --- a/src/feeds/mod.rs +++ b/src/feeds/mod.rs @@ -34,6 +34,7 @@ pub struct Footprint { pub url: String, pub date: Date, pub page: u8, + pub images: Vec, } const ISO8601_DATE: u128 = iso8601::Config::DEFAULT @@ -54,11 +55,23 @@ impl Footprint { page_url(root_url, self.page), self.page ); + // injection again + let content = self + .images + .iter() + .map(|x| format!("
", x)) + .fold(String::new(), |mut a, b| { + a.reserve(b.len()); + a.push_str(&b); + a + }); + ItemBuilder::default() .title(Some(self.title)) .pub_date(self.date.format(&Iso8601::).ok()) .link(Some(self.url.clone())) .description(Some(desc)) + .content(Some(content)) .guid(Some(GuidBuilder::default().value(self.url).build())) .build() } diff --git a/src/scrapers.rs b/src/scrapers.rs index ed1d8db..32b6a02 100644 --- a/src/scrapers.rs +++ b/src/scrapers.rs @@ -28,8 +28,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result, ScrapeEr let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?; let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?; let more_selector = Selector::parse("a#footprintListLoadMore")?; - let text_selector = Selector::parse("div.text > p")?; - let text_rest_selector = Selector::parse("div.text > p > span.rest")?; let mut footprints: Vec = Vec::new(); let mut has_more_pages = true; @@ -44,8 +42,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result, ScrapeEr &x, &footprint_title_selector, &footprint_date_selector, - &text_selector, - &text_rest_selector, page, ) })); @@ -61,26 +57,20 @@ fn parse_footprint( footprint_el: &ElementRef, footprint_title_selector: &Selector, footprint_date_selector: &Selector, - text_selector: &Selector, - text_rest_selector: &Selector, page: u8, ) -> Option { let title_el = footprint_el.select(footprint_title_selector).next()?; - let title = title_el.text().next()?.to_string(); - let url = title_el.value().attr("href")?.to_string(); - let date = footprint_el - .select(footprint_date_selector) - .next()? - .value() - .attr("content")?; + let title = single_text_from_el(&title_el)?; + let url = attr_from_el(&title_el, "href")?; + let date_el = footprint_el.select(footprint_date_selector).next()?; + let date = attr_from_el(&date_el, "content")?; let format = format_description::parse("[year]-[month]-[day]").ok()?; - let date = Date::parse(date, &format).ok()?; + let date = Date::parse(&date, &format).ok()?; - let text = if let Some(text) = footprint_el.select(text_selector).next() { - let text = text.text().next()?.to_string(); - - if let Some(text_rest) = footprint_el.select(text_rest_selector).next() { - format!("{}{}", text, text_rest.text().next()?) + let text = if let Some(text) = single_text_from_selector(footprint_el, "div.text > p") { + if let Some(text_rest) = single_text_from_selector(footprint_el, "div.text > p > span.rest") + { + format!("{}{}", text, text_rest) } else { text } @@ -90,15 +80,55 @@ fn parse_footprint( .trim() .to_string(); + let images = scrape_img_links(&footprint_el).ok()?; + Some(Footprint { title, url, date, text, page, + images, }) } +fn single_text_from_selector(el: &ElementRef, selector: &str) -> Option { + let selector = Selector::parse(selector).ok()?; + let selected = el.select(&selector).next()?; + + single_text_from_el(&selected) +} + +fn single_text_from_el(el: &ElementRef) -> Option { + el.text().next().map(|x| x.to_string()) +} + +fn attr_from_el(el: &ElementRef, attribute: &str) -> Option { + Some(el.value().attr(attribute)?.to_string()) +} + +fn scrape_img_links(footprint_el: &ElementRef) -> Result, ScrapeError> { + let main_img_selector = Selector::parse("div.images-container > a.image")?; + let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?; + + let mut img_urls = Vec::new(); + + if let Some(main_img) = footprint_el.select(&main_img_selector).next() { + if let Some(main_img_url) = attr_from_el(&main_img, "data-url") { + img_urls.push(main_img_url); + } + } + + for img in footprint_el.select(&other_img_selector) { + if let Some(main_img_url) = attr_from_el(&img, "data-url") { + img_urls.push(main_img_url); + } + } + let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect(); + + Ok(img_urls) +} + pub fn page_url(feed_url: &str, page: u8) -> String { let connector = if feed_url.contains('?') { "&" } else { "?" }; format!("{}{}page={}&sort=ASC", feed_url, connector, page)