use crate::feeds::Footprint; use scraper::{ElementRef, Html, Selector}; use thiserror::Error; use time::{format_description, Date}; #[derive(Error, Debug)] pub enum ScrapeError { #[error("An error occurred fetching a document.")] FetchError(#[from] reqwest::Error), #[error("An error occurred constructing a selector.")] SelectorError(#[from] scraper::error::SelectorErrorKind<'static>), } pub async fn feed_title(feed_url: &str) -> Result { let resp = reqwest::get(feed_url).await?.text().await?; let doc = Html::parse_document(&resp); let title_selector = Selector::parse("span.placeholder")?; let title = doc .select(&title_selector) .find_map(|x| x.text().next()) .unwrap_or("no title"); Ok(title.to_string()) } pub async fn fetch_footprints(feed_url: &str) -> Result, ScrapeError> { let footprint_selector = Selector::parse("li.footprint div.footprint-container")?; let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?; let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?; let more_selector = Selector::parse("a#footprintListLoadMore")?; let text_selector = Selector::parse("div.text > p")?; let text_rest_selector = Selector::parse("div.text > p > span.rest")?; let mut footprints: Vec = Vec::new(); let mut has_more_pages = true; let mut page = 1u8; while has_more_pages { let feed_url = page_url(feed_url, page); let resp = reqwest::get(feed_url).await?.text().await?; let doc = Html::parse_document(&resp); footprints.extend(doc.select(&footprint_selector).flat_map(|x| { parse_footprint( &x, &footprint_title_selector, &footprint_date_selector, &text_selector, &text_rest_selector, page, ) })); has_more_pages = doc.select(&more_selector).next().is_some(); page += 1; } Ok(footprints) } fn parse_footprint( footprint_el: &ElementRef, footprint_title_selector: &Selector, footprint_date_selector: &Selector, text_selector: &Selector, text_rest_selector: &Selector, page: u8, ) -> Option { let title_el = footprint_el.select(footprint_title_selector).next()?; let title = title_el.text().next()?.to_string(); let url = title_el.value().attr("href")?.to_string(); let date = footprint_el .select(footprint_date_selector) .next()? .value() .attr("content")?; let format = format_description::parse("[year]-[month]-[day]").ok()?; let date = Date::parse(date, &format).ok()?; let text = footprint_el .select(text_selector) .next()? .text() .next()? .to_string(); let text = if let Some(text_rest) = footprint_el.select(text_rest_selector).next() { format!("{}{}", text, text_rest.text().next()?) } else { text } .trim() .to_string(); Some(Footprint { title, url, date, text, page, }) } pub fn page_url(feed_url: &str, page: u8) -> String { let connector = if feed_url.contains('?') { "&" } else { "?" }; format!("{}{}page={}&sort=ASC", feed_url, connector, page) }