161 lines
4.9 KiB
Rust
161 lines
4.9 KiB
Rust
use crate::feeds::Footprint;
|
|
use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector};
|
|
use serde::Deserialize;
|
|
use thiserror::Error;
|
|
use time::{format_description, Date};
|
|
|
|
#[derive(Error, Debug)]
|
|
pub enum ScrapeError {
|
|
#[error("An error occurred fetching a document.")]
|
|
FetchError(#[from] reqwest::Error),
|
|
#[error("An error occurred constructing a selector.")]
|
|
SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
|
|
}
|
|
|
|
#[derive(Clone, Debug, Deserialize)]
|
|
pub enum Media {
|
|
Image(String),
|
|
Video(String),
|
|
}
|
|
|
|
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
|
|
let resp = reqwest::get(feed_url).await?.text().await?;
|
|
let doc = Html::parse_document(&resp);
|
|
let title_selector = Selector::parse("span.placeholder")?;
|
|
let title = doc
|
|
.select(&title_selector)
|
|
.find_map(|x| x.text().next())
|
|
.unwrap_or("no title");
|
|
|
|
Ok(title.to_string())
|
|
}
|
|
|
|
pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeError> {
|
|
let footprint_selector = Selector::parse("li.footprint article.footprint-container")?;
|
|
let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
|
|
let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
|
|
let more_selector = Selector::parse("a#footprintListLoadMore")?;
|
|
|
|
let mut footprints: Vec<Footprint> = Vec::new();
|
|
let mut has_more_pages = true;
|
|
let mut page = 1u8;
|
|
while has_more_pages {
|
|
let feed_url = page_url(feed_url, page);
|
|
let resp = reqwest::get(feed_url).await?.text().await?;
|
|
let doc = Html::parse_document(&resp);
|
|
|
|
footprints.extend(doc.select(&footprint_selector).flat_map(|x| {
|
|
parse_footprint(
|
|
&x,
|
|
&footprint_title_selector,
|
|
&footprint_date_selector,
|
|
page,
|
|
)
|
|
}));
|
|
|
|
has_more_pages = doc.select(&more_selector).next().is_some();
|
|
page += 1;
|
|
}
|
|
|
|
Ok(footprints)
|
|
}
|
|
|
|
fn parse_footprint(
|
|
footprint_el: &ElementRef,
|
|
footprint_title_selector: &Selector,
|
|
footprint_date_selector: &Selector,
|
|
page: u8,
|
|
) -> Option<Footprint> {
|
|
let title_el = footprint_el.select(footprint_title_selector).next()?;
|
|
let title = single_text_from_el(&title_el)?;
|
|
let url = attr_from_el(&title_el, "href")?;
|
|
let date_el = footprint_el.select(footprint_date_selector).next()?;
|
|
let date = attr_from_el(&date_el, "content")?;
|
|
let format = format_description::parse("[year]-[month]-[day]").ok()?;
|
|
let date = Date::parse(&date, &format).ok()?;
|
|
|
|
let text = if let Some(text) = single_text_from_selector(footprint_el, "div.text > p") {
|
|
if let Some(text_rest) = single_text_from_selector(footprint_el, "div.text > p > span.rest")
|
|
{
|
|
format!("{}{}", text, text_rest)
|
|
} else {
|
|
text
|
|
}
|
|
} else {
|
|
"".to_string()
|
|
}
|
|
.trim()
|
|
.to_string();
|
|
|
|
let media = scrape_media_links(&footprint_el).ok()?;
|
|
|
|
Some(Footprint {
|
|
title,
|
|
url,
|
|
date,
|
|
text,
|
|
page,
|
|
media,
|
|
})
|
|
}
|
|
|
|
fn single_text_from_selector(el: &ElementRef, selector: &str) -> Option<String> {
|
|
let selector = Selector::parse(selector).ok()?;
|
|
let selected = el.select(&selector).next()?;
|
|
|
|
single_text_from_el(&selected)
|
|
}
|
|
|
|
fn single_text_from_el(el: &ElementRef) -> Option<String> {
|
|
el.text().next().map(|x| x.to_string())
|
|
}
|
|
|
|
fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
|
|
Some(el.value().attr(attribute)?.to_string())
|
|
}
|
|
|
|
fn scrape_media_links(footprint_el: &ElementRef) -> Result<Vec<Media>, ScrapeError> {
|
|
let main_media_selector = Selector::parse("div.images-container > a.image")?;
|
|
let other_media_selector =
|
|
Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?;
|
|
|
|
let mut media = Vec::new();
|
|
|
|
if let Some(main_media) = footprint_el.select(&main_media_selector).next() {
|
|
if let Some(main_media) = concrete_media(&main_media) {
|
|
media.push(main_media);
|
|
}
|
|
}
|
|
|
|
for other_media in footprint_el.select(&other_media_selector) {
|
|
if let Some(other_media) = concrete_media(&other_media) {
|
|
media.push(other_media);
|
|
}
|
|
}
|
|
|
|
Ok(media)
|
|
}
|
|
|
|
fn concrete_media(media: &ElementRef) -> Option<Media> {
|
|
if let Some(media_url) = attr_from_el(media, "data-url") {
|
|
let media_url = format!("https:{}", media_url);
|
|
Some(
|
|
if media.has_class(
|
|
&CssLocalName("photo".into()),
|
|
CaseSensitivity::AsciiCaseInsensitive,
|
|
) {
|
|
Media::Image(media_url)
|
|
} else {
|
|
Media::Video(media_url)
|
|
},
|
|
)
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Builds the URL for one page of a feed, sorted ascending.
///
/// Appends `page=<page>&sort=ASC` to the query string of `feed_url`:
///
/// * a `?` is used when the URL has no query yet, a `&` otherwise;
/// * no extra separator is added when the query already ends in `?` or `&`;
/// * a `#fragment`, if present, is kept at the very end so the parameters land
///   in the query rather than inside the fragment.
pub fn page_url(feed_url: &str, page: u8) -> String {
    // Split off the fragment: anything after '#' is not part of the query.
    let (base, fragment) = match feed_url.find('#') {
        Some(idx) => feed_url.split_at(idx),
        None => (feed_url, ""),
    };

    let connector = if !base.contains('?') {
        "?"
    } else if base.ends_with('?') || base.ends_with('&') {
        ""
    } else {
        "&"
    };

    format!("{}{}page={}&sort=ASC{}", base, connector, page, fragment)
}
|