From 951cd9042f7eff3f6621050e3877179eeb478989 Mon Sep 17 00:00:00 2001 From: Sebastian Hugentobler Date: Sun, 7 Jan 2024 14:35:31 +0100 Subject: [PATCH] correctly recognize videos --- Cargo.lock | 2 +- Cargo.toml | 2 +- Containerfile | 2 +- src/feeds/mod.rs | 6 ++--- src/feeds/template.rs | 4 ++- src/scrapers.rs | 55 ++++++++++++++++++++++++++++----------- templates/feed_entry.html | 11 +++++--- 7 files changed, 56 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 578e573..93fbc79 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -516,7 +516,7 @@ dependencies = [ [[package]] name = "findpenguins-feed" -version = "0.3.0" +version = "0.4.0" dependencies = [ "askama", "axum", diff --git a/Cargo.toml b/Cargo.toml index e2b08a6..3eb0596 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "findpenguins-feed" -version = "0.3.0" +version = "0.4.0" edition = "2021" authors = ["Sebastian Hugentobler "] license = "AGPL-3.0-or-later" diff --git a/Containerfile b/Containerfile index e6186da..4647635 100644 --- a/Containerfile +++ b/Containerfile @@ -1,4 +1,4 @@ -FROM docker.io/rust:1-alpine3.18 AS builder +FROM docker.io/rust:1-alpine3.19 AS builder RUN apk --no-cache add musl-dev diff --git a/src/feeds/mod.rs b/src/feeds/mod.rs index bc30452..1c6a0e7 100644 --- a/src/feeds/mod.rs +++ b/src/feeds/mod.rs @@ -9,7 +9,7 @@ use time::format_description::well_known::iso8601::FormattedComponents; use time::format_description::well_known::{iso8601, Iso8601}; use time::Date; -use crate::scrapers::page_url; +use crate::scrapers::{page_url, Media}; use crate::{hash, scrapers}; use self::template::FeedEntryTemplate; @@ -37,7 +37,7 @@ pub struct Footprint { pub url: String, pub date: Date, pub page: u8, - pub images: Vec, + pub media: Vec, } const ISO8601_DATE: u128 = iso8601::Config::DEFAULT @@ -58,7 +58,7 @@ impl Footprint { text: text.to_string(), page_url, page: self.page, - images: self.images, + media: self.media, }; let content = template.render().ok(); diff --git a/src/feeds/template.rs b/src/feeds/template.rs index 8a61300..4848edf 100644 --- a/src/feeds/template.rs +++ b/src/feeds/template.rs @@ -1,5 +1,7 @@ use askama::Template; +use crate::scrapers::Media; + use super::Feed; #[derive(Template)] @@ -14,5 +16,5 @@ pub struct FeedEntryTemplate { pub text: String, pub page_url: String, pub page: u8, - pub images: Vec, + pub media: Vec, } diff --git a/src/scrapers.rs b/src/scrapers.rs index 32b6a02..91f5fbe 100644 --- a/src/scrapers.rs +++ b/src/scrapers.rs @@ -1,5 +1,6 @@ use crate::feeds::Footprint; -use scraper::{ElementRef, Html, Selector}; +use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector}; +use serde::Deserialize; use thiserror::Error; use time::{format_description, Date}; @@ -11,6 +12,12 @@ pub enum ScrapeError { SelectorError(#[from] scraper::error::SelectorErrorKind<'static>), } +#[derive(Clone, Debug, Deserialize)] +pub enum Media { + Image(String), + Video(String), +} + pub async fn feed_title(feed_url: &str) -> Result { let resp = reqwest::get(feed_url).await?.text().await?; let doc = Html::parse_document(&resp); @@ -80,7 +87,7 @@ fn parse_footprint( .trim() .to_string(); - let images = scrape_img_links(&footprint_el).ok()?; + let media = scrape_media_links(&footprint_el).ok()?; Some(Footprint { title, @@ -88,7 +95,7 @@ fn parse_footprint( date, text, page, - images, + media, }) } @@ -107,26 +114,44 @@ fn attr_from_el(el: &ElementRef, attribute: &str) -> Option { Some(el.value().attr(attribute)?.to_string()) } -fn scrape_img_links(footprint_el: &ElementRef) -> Result, ScrapeError> { - let main_img_selector = Selector::parse("div.images-container > a.image")?; - let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?; +fn scrape_media_links(footprint_el: &ElementRef) -> Result, ScrapeError> { + let main_media_selector = Selector::parse("div.images-container > a.image")?; + let other_media_selector = + Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?; - let mut img_urls = Vec::new(); + let mut media = Vec::new(); - if let Some(main_img) = footprint_el.select(&main_img_selector).next() { - if let Some(main_img_url) = attr_from_el(&main_img, "data-url") { - img_urls.push(main_img_url); + if let Some(main_media) = footprint_el.select(&main_media_selector).next() { + if let Some(main_media) = concrete_media(&main_media) { + media.push(main_media); } } - for img in footprint_el.select(&other_img_selector) { - if let Some(main_img_url) = attr_from_el(&img, "data-url") { - img_urls.push(main_img_url); + for other_media in footprint_el.select(&other_media_selector) { + if let Some(other_media) = concrete_media(&other_media) { + media.push(other_media); } } - let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect(); - Ok(img_urls) + Ok(media) +} + +fn concrete_media(media: &ElementRef) -> Option { + if let Some(media_url) = attr_from_el(media, "data-url") { + let media_url = format!("https:{}", media_url); + Some( + if media.has_class( + &CssLocalName("photo".into()), + CaseSensitivity::AsciiCaseInsensitive, + ) { + Media::Image(media_url) + } else { + Media::Video(media_url) + }, + ) + } else { + None + } } pub fn page_url(feed_url: &str, page: u8) -> String { diff --git a/templates/feed_entry.html b/templates/feed_entry.html index c9f8000..305ce35 100644 --- a/templates/feed_entry.html +++ b/templates/feed_entry.html @@ -1,7 +1,10 @@ {{ text }}

---

-from page {{ page }} +from page {{ page }}
-{% for img in images %} -
-{% endfor %} +{% for media_entry in media %} {% match media_entry %} {% when Media::Image with +(url) %} +
+{% when Media::Video with (url) %} +