correctly recognize videos
Some checks failed
Build Multiarch Container Image / call-reusable-workflow (push) Has been cancelled
Some checks failed
Build Multiarch Container Image / call-reusable-workflow (push) Has been cancelled
This commit is contained in:
parent
92f7a489fb
commit
951cd9042f
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -516,7 +516,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "findpenguins-feed"
|
name = "findpenguins-feed"
|
||||||
version = "0.3.0"
|
version = "0.4.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"askama",
|
"askama",
|
||||||
"axum",
|
"axum",
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "findpenguins-feed"
|
name = "findpenguins-feed"
|
||||||
version = "0.3.0"
|
version = "0.4.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
authors = ["Sebastian Hugentobler <shu@vanwa.ch>"]
|
authors = ["Sebastian Hugentobler <shu@vanwa.ch>"]
|
||||||
license = "AGPL-3.0-or-later"
|
license = "AGPL-3.0-or-later"
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
FROM docker.io/rust:1-alpine3.18 AS builder
|
FROM docker.io/rust:1-alpine3.19 AS builder
|
||||||
|
|
||||||
RUN apk --no-cache add musl-dev
|
RUN apk --no-cache add musl-dev
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ use time::format_description::well_known::iso8601::FormattedComponents;
|
|||||||
use time::format_description::well_known::{iso8601, Iso8601};
|
use time::format_description::well_known::{iso8601, Iso8601};
|
||||||
use time::Date;
|
use time::Date;
|
||||||
|
|
||||||
use crate::scrapers::page_url;
|
use crate::scrapers::{page_url, Media};
|
||||||
use crate::{hash, scrapers};
|
use crate::{hash, scrapers};
|
||||||
|
|
||||||
use self::template::FeedEntryTemplate;
|
use self::template::FeedEntryTemplate;
|
||||||
@ -37,7 +37,7 @@ pub struct Footprint {
|
|||||||
pub url: String,
|
pub url: String,
|
||||||
pub date: Date,
|
pub date: Date,
|
||||||
pub page: u8,
|
pub page: u8,
|
||||||
pub images: Vec<String>,
|
pub media: Vec<Media>,
|
||||||
}
|
}
|
||||||
|
|
||||||
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
|
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
|
||||||
@ -58,7 +58,7 @@ impl Footprint {
|
|||||||
text: text.to_string(),
|
text: text.to_string(),
|
||||||
page_url,
|
page_url,
|
||||||
page: self.page,
|
page: self.page,
|
||||||
images: self.images,
|
media: self.media,
|
||||||
};
|
};
|
||||||
let content = template.render().ok();
|
let content = template.render().ok();
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
use askama::Template;
|
use askama::Template;
|
||||||
|
|
||||||
|
use crate::scrapers::Media;
|
||||||
|
|
||||||
use super::Feed;
|
use super::Feed;
|
||||||
|
|
||||||
#[derive(Template)]
|
#[derive(Template)]
|
||||||
@ -14,5 +16,5 @@ pub struct FeedEntryTemplate {
|
|||||||
pub text: String,
|
pub text: String,
|
||||||
pub page_url: String,
|
pub page_url: String,
|
||||||
pub page: u8,
|
pub page: u8,
|
||||||
pub images: Vec<String>,
|
pub media: Vec<Media>,
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
use crate::feeds::Footprint;
|
use crate::feeds::Footprint;
|
||||||
use scraper::{ElementRef, Html, Selector};
|
use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector};
|
||||||
|
use serde::Deserialize;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use time::{format_description, Date};
|
use time::{format_description, Date};
|
||||||
|
|
||||||
@ -11,6 +12,12 @@ pub enum ScrapeError {
|
|||||||
SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
|
SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A single media attachment scraped from a footprint; each variant wraps the media URL.
#[derive(Clone, Debug, Deserialize)]
|
||||||
|
pub enum Media {
|
||||||
|
/// An image; the feed entry template renders it as an `<img>` element.
Image(String),
|
||||||
|
/// A video; the feed entry template renders it as a `<video>` element.
Video(String),
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
|
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
|
||||||
let resp = reqwest::get(feed_url).await?.text().await?;
|
let resp = reqwest::get(feed_url).await?.text().await?;
|
||||||
let doc = Html::parse_document(&resp);
|
let doc = Html::parse_document(&resp);
|
||||||
@ -80,7 +87,7 @@ fn parse_footprint(
|
|||||||
.trim()
|
.trim()
|
||||||
.to_string();
|
.to_string();
|
||||||
|
|
||||||
let images = scrape_img_links(&footprint_el).ok()?;
|
let media = scrape_media_links(&footprint_el).ok()?;
|
||||||
|
|
||||||
Some(Footprint {
|
Some(Footprint {
|
||||||
title,
|
title,
|
||||||
@ -88,7 +95,7 @@ fn parse_footprint(
|
|||||||
date,
|
date,
|
||||||
text,
|
text,
|
||||||
page,
|
page,
|
||||||
images,
|
media,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -107,26 +114,44 @@ fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
|
|||||||
Some(el.value().attr(attribute)?.to_string())
|
Some(el.value().attr(attribute)?.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> {
|
/// Collects the media attached to a footprint element.
///
/// Takes the first `div.images-container > a.image` match, then every thumbnail match, and
/// converts each with `concrete_media`; elements for which it returns `None` are left out.
///
/// # Errors
///
/// Returns a `ScrapeError` if one of the CSS selectors fails to parse.
fn scrape_media_links(footprint_el: &ElementRef) -> Result<Vec<Media>, ScrapeError> {
|
||||||
let main_img_selector = Selector::parse("div.images-container > a.image")?;
|
let main_media_selector = Selector::parse("div.images-container > a.image")?;
|
||||||
let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?;
|
let other_media_selector =
|
||||||
|
// NOTE(review): `.image` appears twice in this selector; the second occurrence looks
// redundant — confirm whether another class was intended.
Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?;
|
||||||
|
|
||||||
let mut img_urls = Vec::new();
|
let mut media = Vec::new();
|
||||||
|
|
||||||
if let Some(main_img) = footprint_el.select(&main_img_selector).next() {
|
// Only the first match of the main selector is used.
if let Some(main_media) = footprint_el.select(&main_media_selector).next() {
|
||||||
if let Some(main_img_url) = attr_from_el(&main_img, "data-url") {
|
if let Some(main_media) = concrete_media(&main_media) {
|
||||||
img_urls.push(main_img_url);
|
media.push(main_media);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for img in footprint_el.select(&other_img_selector) {
|
// Every thumbnail match is appended after the main entry.
for other_media in footprint_el.select(&other_media_selector) {
|
||||||
if let Some(main_img_url) = attr_from_el(&img, "data-url") {
|
if let Some(other_media) = concrete_media(&other_media) {
|
||||||
img_urls.push(main_img_url);
|
media.push(other_media);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect();
|
|
||||||
|
|
||||||
Ok(img_urls)
|
Ok(media)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Turns a media link element into a [`Media`] value.
///
/// Reads the element's `data-url` attribute and prefixes it with `https:`. Returns `None`
/// when the attribute is missing. Elements carrying the `photo` class (compared ASCII
/// case-insensitively) become `Media::Image`; every other element becomes `Media::Video`.
fn concrete_media(media: &ElementRef) -> Option<Media> {
|
||||||
|
if let Some(media_url) = attr_from_el(media, "data-url") {
|
||||||
|
// NOTE(review): presumably `data-url` is protocol-relative ("//host/path"); an
// already absolute URL would end up with a doubled scheme — confirm against the site.
let media_url = format!("https:{}", media_url);
|
||||||
|
Some(
|
||||||
|
// NOTE(review): anything without the `photo` class is treated as a video — confirm
// that no other media kinds appear in the markup.
if media.has_class(
|
||||||
|
&CssLocalName("photo".into()),
|
||||||
|
CaseSensitivity::AsciiCaseInsensitive,
|
||||||
|
) {
|
||||||
|
Media::Image(media_url)
|
||||||
|
} else {
|
||||||
|
Media::Video(media_url)
|
||||||
|
},
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn page_url(feed_url: &str, page: u8) -> String {
|
pub fn page_url(feed_url: &str, page: u8) -> String {
|
||||||
|
@ -1,7 +1,10 @@
|
|||||||
{{ text }} <br /><br />
|
{{ text }} <br /><br />
|
||||||
--- <br /><br />
|
--- <br /><br />
|
||||||
from <a href="{{ page_url }}">page {{ page }}</a>
|
from <a href="{{ page_url }}">page {{ page }}</a><br />
|
||||||
|
|
||||||
{% for img in images %}
|
{% for media_entry in media %} {% match media_entry %} {% when Media::Image with
|
||||||
<img src="{{ img }}" /><br />
|
(url) %}
|
||||||
{% endfor %}
|
<img src="{{ url }}" /><br />
|
||||||
|
{% when Media::Video with (url) %}
|
||||||
|
{# <video> is not a void element: close it explicitly so following media is not swallowed as fallback content, and expose controls so it can be played. #}
<video src="{{ url }}" controls></video><br />
|
||||||
|
{% endmatch %} {% endfor %}
|
||||||
|
Loading…
Reference in New Issue
Block a user