correctly recognize videos
Build Multiarch Container Image / call-reusable-workflow (push) Has been cancelled Details

This commit is contained in:
Sebastian Hugentobler 2024-01-07 14:35:31 +01:00
parent 92f7a489fb
commit 951cd9042f
Signed by: shu
GPG Key ID: BB32CF3CA052C2F0
7 changed files with 56 additions and 26 deletions

2
Cargo.lock generated
View File

@ -516,7 +516,7 @@ dependencies = [
[[package]]
name = "findpenguins-feed"
version = "0.3.0"
version = "0.4.0"
dependencies = [
"askama",
"axum",

View File

@ -1,6 +1,6 @@
[package]
name = "findpenguins-feed"
version = "0.3.0"
version = "0.4.0"
edition = "2021"
authors = ["Sebastian Hugentobler <shu@vanwa.ch>"]
license = "AGPL-3.0-or-later"

View File

@ -1,4 +1,4 @@
FROM docker.io/rust:1-alpine3.18 AS builder
FROM docker.io/rust:1-alpine3.19 AS builder
RUN apk --no-cache add musl-dev

View File

@ -9,7 +9,7 @@ use time::format_description::well_known::iso8601::FormattedComponents;
use time::format_description::well_known::{iso8601, Iso8601};
use time::Date;
use crate::scrapers::page_url;
use crate::scrapers::{page_url, Media};
use crate::{hash, scrapers};
use self::template::FeedEntryTemplate;
@ -37,7 +37,7 @@ pub struct Footprint {
pub url: String,
pub date: Date,
pub page: u8,
pub images: Vec<String>,
pub media: Vec<Media>,
}
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
@ -58,7 +58,7 @@ impl Footprint {
text: text.to_string(),
page_url,
page: self.page,
images: self.images,
media: self.media,
};
let content = template.render().ok();

View File

@ -1,5 +1,7 @@
use askama::Template;
use crate::scrapers::Media;
use super::Feed;
#[derive(Template)]
@ -14,5 +16,5 @@ pub struct FeedEntryTemplate {
pub text: String,
pub page_url: String,
pub page: u8,
pub images: Vec<String>,
pub media: Vec<Media>,
}

View File

@ -1,5 +1,6 @@
use crate::feeds::Footprint;
use scraper::{ElementRef, Html, Selector};
use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector};
use serde::Deserialize;
use thiserror::Error;
use time::{format_description, Date};
@ -11,6 +12,12 @@ pub enum ScrapeError {
SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
}
/// A single piece of media scraped from a footprint.
///
/// Each variant wraps the media URL as a `String`.
#[derive(Clone, Debug, Deserialize)]
pub enum Media {
/// URL of a still image.
Image(String),
/// URL of a video.
Video(String),
}
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
let resp = reqwest::get(feed_url).await?.text().await?;
let doc = Html::parse_document(&resp);
@ -80,7 +87,7 @@ fn parse_footprint(
.trim()
.to_string();
let images = scrape_img_links(&footprint_el).ok()?;
let media = scrape_media_links(&footprint_el).ok()?;
Some(Footprint {
title,
@ -88,7 +95,7 @@ fn parse_footprint(
date,
text,
page,
images,
media,
})
}
@ -107,26 +114,44 @@ fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
Some(el.value().attr(attribute)?.to_string())
}
fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> {
let main_img_selector = Selector::parse("div.images-container > a.image")?;
let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?;
fn scrape_media_links(footprint_el: &ElementRef) -> Result<Vec<Media>, ScrapeError> {
let main_media_selector = Selector::parse("div.images-container > a.image")?;
let other_media_selector =
Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?;
let mut img_urls = Vec::new();
let mut media = Vec::new();
if let Some(main_img) = footprint_el.select(&main_img_selector).next() {
if let Some(main_img_url) = attr_from_el(&main_img, "data-url") {
img_urls.push(main_img_url);
if let Some(main_media) = footprint_el.select(&main_media_selector).next() {
if let Some(main_media) = concrete_media(&main_media) {
media.push(main_media);
}
}
for img in footprint_el.select(&other_img_selector) {
if let Some(main_img_url) = attr_from_el(&img, "data-url") {
img_urls.push(main_img_url);
for other_media in footprint_el.select(&other_media_selector) {
if let Some(other_media) = concrete_media(&other_media) {
media.push(other_media);
}
}
let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect();
Ok(img_urls)
Ok(media)
}
/// Converts a media element into a [`Media`] value.
///
/// Reads the element's `data-url` attribute and prefixes it with `https:`
/// (presumably the attribute holds a protocol-relative URL; verify against
/// the scraped markup). An element carrying the `photo` class, compared
/// ASCII case-insensitively, becomes [`Media::Image`]; any other element is
/// treated as [`Media::Video`].
///
/// Returns `None` when the element has no `data-url` attribute.
fn concrete_media(media: &ElementRef) -> Option<Media> {
    let raw_url = attr_from_el(media, "data-url")?;
    let url = format!("https:{}", raw_url);

    let photo_class = CssLocalName("photo".into());
    let is_photo = media.has_class(&photo_class, CaseSensitivity::AsciiCaseInsensitive);

    let entry = if is_photo {
        Media::Image(url)
    } else {
        Media::Video(url)
    };
    Some(entry)
}
pub fn page_url(feed_url: &str, page: u8) -> String {

View File

@ -1,7 +1,10 @@
{{ text }} <br /><br />
--- <br /><br />
from <a href="{{ page_url }}">page {{ page }}</a>
from <a href="{{ page_url }}">page {{ page }}</a><br />
{% for img in images %}
<img src="{{ img }}" /><br />
{% endfor %}
{# Render every media entry: images as <img>, videos as a playable <video>.
   <video> is not a void element, so it needs an explicit closing tag;
   otherwise everything after it is parsed as the video's fallback content. #}
{% for media_entry in media %} {% match media_entry %} {% when Media::Image with (url) %}
<img src="{{ url }}" /><br />
{% when Media::Video with (url) %}
<video src="{{ url }}" controls></video><br />
{% endmatch %} {% endfor %}