use crate::feeds::Footprint; use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector}; use serde::Deserialize; use thiserror::Error; use time::{format_description, Date}; #[derive(Error, Debug)] pub enum ScrapeError { #[error("An error occurred fetching a document.")] FetchError(#[from] reqwest::Error), #[error("An error occurred constructing a selector.")] SelectorError(#[from] scraper::error::SelectorErrorKind<'static>), } #[derive(Clone, Debug, Deserialize)] pub enum Media { Image(String), Video(String), } pub async fn feed_title(feed_url: &str) -> Result { let resp = reqwest::get(feed_url).await?.text().await?; let doc = Html::parse_document(&resp); let title_selector = Selector::parse("span.placeholder")?; let title = doc .select(&title_selector) .find_map(|x| x.text().next()) .unwrap_or("no title"); Ok(title.to_string()) } pub async fn fetch_footprints(feed_url: &str) -> Result, ScrapeError> { let footprint_selector = Selector::parse("li.footprint article.footprint-container")?; let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?; let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?; let more_selector = Selector::parse("a#footprintListLoadMore")?; let mut footprints: Vec = Vec::new(); let mut has_more_pages = true; let mut page = 1u8; while has_more_pages { let feed_url = page_url(feed_url, page); let resp = reqwest::get(feed_url).await?.text().await?; let doc = Html::parse_document(&resp); footprints.extend(doc.select(&footprint_selector).flat_map(|x| { parse_footprint( &x, &footprint_title_selector, &footprint_date_selector, page, ) })); has_more_pages = doc.select(&more_selector).next().is_some(); page += 1; } Ok(footprints) } fn parse_footprint( footprint_el: &ElementRef, footprint_title_selector: &Selector, footprint_date_selector: &Selector, page: u8, ) -> Option { let title_el = footprint_el.select(footprint_title_selector).next()?; let title = single_text_from_el(&title_el)?; let url = attr_from_el(&title_el, "href")?; let date_el = footprint_el.select(footprint_date_selector).next()?; let date = attr_from_el(&date_el, "content")?; let format = format_description::parse("[year]-[month]-[day]").ok()?; let date = Date::parse(&date, &format).ok()?; let text = if let Some(text) = single_text_from_selector(footprint_el, "div.text > p") { if let Some(text_rest) = single_text_from_selector(footprint_el, "div.text > p > span.rest") { format!("{}{}", text, text_rest) } else { text } } else { "".to_string() } .trim() .to_string(); let media = scrape_media_links(&footprint_el).ok()?; Some(Footprint { title, url, date, text, page, media, }) } fn single_text_from_selector(el: &ElementRef, selector: &str) -> Option { let selector = Selector::parse(selector).ok()?; let selected = el.select(&selector).next()?; single_text_from_el(&selected) } fn single_text_from_el(el: &ElementRef) -> Option { el.text().next().map(|x| x.to_string()) } fn attr_from_el(el: &ElementRef, attribute: &str) -> Option { Some(el.value().attr(attribute)?.to_string()) } fn scrape_media_links(footprint_el: &ElementRef) -> Result, ScrapeError> { let main_media_selector = Selector::parse("div.images-container > a.image")?; let other_media_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?; let mut media = Vec::new(); if let Some(main_media) = footprint_el.select(&main_media_selector).next() { if let Some(main_media) = concrete_media(&main_media) { media.push(main_media); } } for other_media in footprint_el.select(&other_media_selector) { if let Some(other_media) = concrete_media(&other_media) { media.push(other_media); } } Ok(media) } fn concrete_media(media: &ElementRef) -> Option { if let Some(media_url) = attr_from_el(media, "data-url") { let media_url = format!("https:{}", media_url); Some( if media.has_class( &CssLocalName("photo".into()), CaseSensitivity::AsciiCaseInsensitive, ) { Media::Image(media_url) } else { Media::Video(media_url) }, ) } else { None } } pub fn page_url(feed_url: &str, page: u8) -> String { let connector = if feed_url.contains('?') { "&" } else { "?" }; format!("{}{}page={}&sort=ASC", feed_url, connector, page) }