correctly recognize videos
Some checks failed
Build Multiarch Container Image / call-reusable-workflow (push) Has been cancelled

This commit is contained in:
Sebastian Hugentobler 2024-01-07 14:35:31 +01:00
parent 92f7a489fb
commit 951cd9042f
Signed by: shu
GPG Key ID: BB32CF3CA052C2F0
7 changed files with 56 additions and 26 deletions

2
Cargo.lock generated
View File

@ -516,7 +516,7 @@ dependencies = [
[[package]] [[package]]
name = "findpenguins-feed" name = "findpenguins-feed"
version = "0.3.0" version = "0.4.0"
dependencies = [ dependencies = [
"askama", "askama",
"axum", "axum",

View File

@ -1,6 +1,6 @@
[package] [package]
name = "findpenguins-feed" name = "findpenguins-feed"
version = "0.3.0" version = "0.4.0"
edition = "2021" edition = "2021"
authors = ["Sebastian Hugentobler <shu@vanwa.ch>"] authors = ["Sebastian Hugentobler <shu@vanwa.ch>"]
license = "AGPL-3.0-or-later" license = "AGPL-3.0-or-later"

View File

@ -1,4 +1,4 @@
FROM docker.io/rust:1-alpine3.18 AS builder FROM docker.io/rust:1-alpine3.19 AS builder
RUN apk --no-cache add musl-dev RUN apk --no-cache add musl-dev

View File

@ -9,7 +9,7 @@ use time::format_description::well_known::iso8601::FormattedComponents;
use time::format_description::well_known::{iso8601, Iso8601}; use time::format_description::well_known::{iso8601, Iso8601};
use time::Date; use time::Date;
use crate::scrapers::page_url; use crate::scrapers::{page_url, Media};
use crate::{hash, scrapers}; use crate::{hash, scrapers};
use self::template::FeedEntryTemplate; use self::template::FeedEntryTemplate;
@ -37,7 +37,7 @@ pub struct Footprint {
pub url: String, pub url: String,
pub date: Date, pub date: Date,
pub page: u8, pub page: u8,
pub images: Vec<String>, pub media: Vec<Media>,
} }
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
@ -58,7 +58,7 @@ impl Footprint {
text: text.to_string(), text: text.to_string(),
page_url, page_url,
page: self.page, page: self.page,
images: self.images, media: self.media,
}; };
let content = template.render().ok(); let content = template.render().ok();

View File

@ -1,5 +1,7 @@
use askama::Template; use askama::Template;
use crate::scrapers::Media;
use super::Feed; use super::Feed;
#[derive(Template)] #[derive(Template)]
@ -14,5 +16,5 @@ pub struct FeedEntryTemplate {
pub text: String, pub text: String,
pub page_url: String, pub page_url: String,
pub page: u8, pub page: u8,
pub images: Vec<String>, pub media: Vec<Media>,
} }

View File

@ -1,5 +1,6 @@
use crate::feeds::Footprint; use crate::feeds::Footprint;
use scraper::{ElementRef, Html, Selector}; use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector};
use serde::Deserialize;
use thiserror::Error; use thiserror::Error;
use time::{format_description, Date}; use time::{format_description, Date};
@ -11,6 +12,12 @@ pub enum ScrapeError {
SelectorError(#[from] scraper::error::SelectorErrorKind<'static>), SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
} }
/// A single media item scraped from a footprint, tagged by its kind.
///
/// Both variants carry the media URL as a `String`. Values are produced by
/// `concrete_media`, which builds the URL from the element's `data-url`
/// attribute prefixed with `https:`.
#[derive(Clone, Debug, Deserialize)]
pub enum Media {
/// URL of an image; chosen when the source element has the `photo` class.
Image(String),
/// URL of a video; chosen for every source element without the `photo` class.
Video(String),
}
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> { pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
let resp = reqwest::get(feed_url).await?.text().await?; let resp = reqwest::get(feed_url).await?.text().await?;
let doc = Html::parse_document(&resp); let doc = Html::parse_document(&resp);
@ -80,7 +87,7 @@ fn parse_footprint(
.trim() .trim()
.to_string(); .to_string();
let images = scrape_img_links(&footprint_el).ok()?; let media = scrape_media_links(&footprint_el).ok()?;
Some(Footprint { Some(Footprint {
title, title,
@ -88,7 +95,7 @@ fn parse_footprint(
date, date,
text, text,
page, page,
images, media,
}) })
} }
@ -107,26 +114,44 @@ fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
Some(el.value().attr(attribute)?.to_string()) Some(el.value().attr(attribute)?.to_string())
} }
/// Collects the media linked from a footprint element.
///
/// The main entry (the first `div.images-container > a.image` match) is pushed
/// first, followed by every thumbnail link found under
/// `div.images-container > div.thumbs`. Elements without a `data-url`
/// attribute are skipped. Returns an error only if a selector fails to parse.
fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> { fn scrape_media_links(footprint_el: &ElementRef) -> Result<Vec<Media>, ScrapeError> {
let main_img_selector = Selector::parse("div.images-container > a.image")?; let main_media_selector = Selector::parse("div.images-container > a.image")?;
let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?; let other_media_selector =
// NOTE(review): `.image` appears twice in this compound selector; the
// second occurrence is redundant.
Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?;
let mut img_urls = Vec::new(); let mut media = Vec::new();
if let Some(main_img) = footprint_el.select(&main_img_selector).next() { if let Some(main_media) = footprint_el.select(&main_media_selector).next() {
if let Some(main_img_url) = attr_from_el(&main_img, "data-url") { if let Some(main_media) = concrete_media(&main_media) {
img_urls.push(main_img_url); media.push(main_media);
} }
} }
for img in footprint_el.select(&other_img_selector) { for other_media in footprint_el.select(&other_media_selector) {
if let Some(main_img_url) = attr_from_el(&img, "data-url") { if let Some(other_media) = concrete_media(&other_media) {
img_urls.push(main_img_url); media.push(other_media);
} }
} }
let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect();
Ok(img_urls) Ok(media)
}
/// Converts a single media link element into a [`Media`] entry.
///
/// Returns `None` when the element has no `data-url` attribute. Otherwise the
/// attribute value is prefixed with `https:` and wrapped in `Media::Image` if
/// the element has the `photo` class, or in `Media::Video` if it does not.
fn concrete_media(media: &ElementRef) -> Option<Media> {
if let Some(media_url) = attr_from_el(media, "data-url") {
// NOTE(review): prepending only the scheme assumes `data-url` holds a
// protocol-relative URL (`//host/path`) — confirm against the scraped page.
let media_url = format!("https:{}", media_url);
Some(
// The class name is compared ASCII case-insensitively.
if media.has_class(
&CssLocalName("photo".into()),
CaseSensitivity::AsciiCaseInsensitive,
) {
Media::Image(media_url)
} else {
// There is no positive check for a video marker: anything that is
// not tagged `photo` is classified as a video.
Media::Video(media_url)
},
)
} else {
None
}
} }
pub fn page_url(feed_url: &str, page: u8) -> String { pub fn page_url(feed_url: &str, page: u8) -> String {

View File

@ -1,7 +1,10 @@
{{ text }} <br /><br /> {{ text }} <br /><br />
--- <br /><br /> --- <br /><br />
from <a href="{{ page_url }}">page {{ page }}</a> from <a href="{{ page_url }}">page {{ page }}</a><br />
{% for img in images %} {% for media_entry in media %} {% match media_entry %} {% when Media::Image with
<img src="{{ img }}" /><br /> (url) %}
{% endfor %} <img src="{{ url }}" /><br />
{% when Media::Video with (url) %}
<video src="{{ url }}" controls></video><br />
{% endmatch %} {% endfor %}