initial commit
This commit is contained in:
commit
2a9f427bc7
21 changed files with 3692 additions and 0 deletions
105
src/scrapers.rs
Normal file
105
src/scrapers.rs
Normal file
|
@ -0,0 +1,105 @@
|
|||
use crate::feeds::Footprint;
|
||||
use scraper::{ElementRef, Html, Selector};
|
||||
use thiserror::Error;
|
||||
use time::{format_description, Date};
|
||||
|
||||
/// Errors that can occur while scraping a remote feed.
#[derive(Error, Debug)]
pub enum ScrapeError {
    /// The HTTP request for a document failed (connection, status, or body read).
    #[error("An error occurred fetching a document.")]
    FetchError(#[from] reqwest::Error),
    /// A CSS selector string could not be parsed into a `Selector`.
    #[error("An error occurred constructing a selector.")]
    SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
}
|
||||
|
||||
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
|
||||
let resp = reqwest::get(feed_url).await?.text().await?;
|
||||
let doc = Html::parse_document(&resp);
|
||||
let title_selector = Selector::parse("span.placeholder")?;
|
||||
let title = doc
|
||||
.select(&title_selector)
|
||||
.find_map(|x| x.text().next())
|
||||
.unwrap_or("no title");
|
||||
|
||||
Ok(title.to_string())
|
||||
}
|
||||
|
||||
pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeError> {
|
||||
let footprint_selector = Selector::parse("li.footprint div.footprint-container")?;
|
||||
let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
|
||||
let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
|
||||
let more_selector = Selector::parse("a#footprintListLoadMore")?;
|
||||
let text_selector = Selector::parse("div.text > p")?;
|
||||
let text_rest_selector = Selector::parse("div.text > p > span.rest")?;
|
||||
|
||||
let mut footprints: Vec<Footprint> = Vec::new();
|
||||
let mut has_more_pages = true;
|
||||
let mut page = 1u8;
|
||||
while has_more_pages {
|
||||
let feed_url = page_url(feed_url, page);
|
||||
let resp = reqwest::get(feed_url).await?.text().await?;
|
||||
let doc = Html::parse_document(&resp);
|
||||
|
||||
footprints.extend(doc.select(&footprint_selector).flat_map(|x| {
|
||||
parse_footprint(
|
||||
&x,
|
||||
&footprint_title_selector,
|
||||
&footprint_date_selector,
|
||||
&text_selector,
|
||||
&text_rest_selector,
|
||||
page,
|
||||
)
|
||||
}));
|
||||
|
||||
has_more_pages = doc.select(&more_selector).next().is_some();
|
||||
page += 1;
|
||||
}
|
||||
|
||||
Ok(footprints)
|
||||
}
|
||||
|
||||
fn parse_footprint(
|
||||
footprint_el: &ElementRef,
|
||||
footprint_title_selector: &Selector,
|
||||
footprint_date_selector: &Selector,
|
||||
text_selector: &Selector,
|
||||
text_rest_selector: &Selector,
|
||||
page: u8,
|
||||
) -> Option<Footprint> {
|
||||
let title_el = footprint_el.select(footprint_title_selector).next()?;
|
||||
let title = title_el.text().next()?.to_string();
|
||||
let url = title_el.value().attr("href")?.to_string();
|
||||
let date = footprint_el
|
||||
.select(footprint_date_selector)
|
||||
.next()?
|
||||
.value()
|
||||
.attr("content")?;
|
||||
let format = format_description::parse("[year]-[month]-[day]").ok()?;
|
||||
let date = Date::parse(date, &format).ok()?;
|
||||
let text = footprint_el
|
||||
.select(text_selector)
|
||||
.next()?
|
||||
.text()
|
||||
.next()?
|
||||
.to_string();
|
||||
|
||||
let text = if let Some(text_rest) = footprint_el.select(text_rest_selector).next() {
|
||||
format!("{}{}", text, text_rest.text().next()?)
|
||||
} else {
|
||||
text
|
||||
}
|
||||
.trim()
|
||||
.to_string();
|
||||
|
||||
Some(Footprint {
|
||||
title,
|
||||
url,
|
||||
date,
|
||||
text,
|
||||
page,
|
||||
})
|
||||
}
|
||||
|
||||
/// Build the URL for page `page` of a feed.
///
/// Appends `page=<n>&sort=ASC` to `feed_url`, choosing `&` as the separator
/// when the URL already carries a query string and `?` otherwise.
pub fn page_url(feed_url: &str, page: u8) -> String {
    let separator = match feed_url.find('?') {
        Some(_) => "&",
        None => "?",
    };
    format!("{}{}page={}&sort=ASC", feed_url, separator, page)
}
|
Loading…
Add table
Add a link
Reference in a new issue