From 75c5a47449c6aef3c8f8b0d48ab1db4619d404e8 Mon Sep 17 00:00:00 2001
From: Sebastian Hugentobler
Date: Mon, 27 Nov 2023 10:49:50 +0100
Subject: [PATCH 01/10] use templates to create the feed entries

---
 Cargo.lock                |  2 +-
 Cargo.toml                |  2 +-
 src/feeds/mod.rs          | 35 +++++++++++++++--------------------
 src/feeds/template.rs     |  9 +++++++++
 templates/feed_entry.html |  7 +++++++
 5 files changed, 33 insertions(+), 22 deletions(-)
 create mode 100644 templates/feed_entry.html

diff --git a/Cargo.lock b/Cargo.lock
index e192f66..578e573 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -516,7 +516,7 @@ dependencies = [
 
 [[package]]
 name = "findpenguins-feed"
-version = "0.2.0"
+version = "0.3.0"
 dependencies = [
  "askama",
  "axum",
diff --git a/Cargo.toml b/Cargo.toml
index e9b5885..e2b08a6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "findpenguins-feed"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 authors = ["Sebastian Hugentobler "]
 license = "AGPL-3.0-or-later"
diff --git a/src/feeds/mod.rs b/src/feeds/mod.rs
index 0625139..bc30452 100644
--- a/src/feeds/mod.rs
+++ b/src/feeds/mod.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 use std::fmt;
 
+use askama::Template;
 use futures::future::join_all;
 use rss::{GuidBuilder, Item, ItemBuilder};
 use serde::Deserialize;
@@ -11,6 +12,8 @@ use time::Date;
 use crate::scrapers::page_url;
 use crate::{hash, scrapers};
 
+use self::template::FeedEntryTemplate;
+
 pub mod route;
 pub mod template;
 
@@ -44,34 +47,26 @@ const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
 impl Footprint {
     pub fn into_rss_item(self, root_url: &str) -> Item {
         let text = if String::is_empty(&self.text) {
-            "No description"
+            "No text"
         } else {
             &self.text
         };
-        // injection, I know
-        let desc = format!(
-            "{}<br/><br/>---<br/><br/>from page <a href=\"{}\">{}</a>",
-            text,
-            page_url(root_url, self.page),
-            self.page
-        );
-        // injection again
-        let content = self
-            .images
-            .iter()
-            .map(|x| format!("<img src=\"{}\"/>", x))
-            .fold(String::new(), |mut a, b| {
-                a.reserve(b.len());
-                a.push_str(&b);
-                a
-            });
+
+        let page_url = page_url(root_url, self.page);
+
+        let template = FeedEntryTemplate {
+            text: text.to_string(),
+            page_url,
+            page: self.page,
+            images: self.images,
+        };
+        let content = template.render().ok();
 
         ItemBuilder::default()
             .title(Some(self.title))
             .pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok())
             .link(Some(self.url.clone()))
-            .description(Some(desc))
-            .content(Some(content))
+            .content(content)
            .guid(Some(GuidBuilder::default().value(self.url).build()))
             .build()
     }
diff --git a/src/feeds/template.rs b/src/feeds/template.rs
index 6fc8f50..8a61300 100644
--- a/src/feeds/template.rs
+++ b/src/feeds/template.rs
@@ -7,3 +7,12 @@ use super::Feed;
 pub struct FeedsTemplate {
     pub feeds: Vec<Feed>,
 }
+
+#[derive(Template)]
+#[template(path = "feed_entry.html")]
+pub struct FeedEntryTemplate {
+    pub text: String,
+    pub page_url: String,
+    pub page: u8,
+    pub images: Vec<String>,
+}
diff --git a/templates/feed_entry.html b/templates/feed_entry.html
new file mode 100644
index 0000000..c9f8000
--- /dev/null
+++ b/templates/feed_entry.html
@@ -0,0 +1,7 @@
+{{ text }}<br/><br/>
+---<br/><br/>
+from page <a href="{{ page_url }}">{{ page }}</a>
+
+{% for img in images %}
+<img src="{{ img }}"/><br/>
+{% endfor %}
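
What this patch buys: the old into_rss_item() assembled the RSS description and content by splicing footprint text and image URLs into HTML strings by hand (the removed comments even admit "injection, I know"). Rendering through an Askama template moves the markup into templates/feed_entry.html, where interpolated values go through Askama's default HTML escaping. A minimal sketch of how the derived template is used on its own, assuming the feed_entry.html from this patch is on the template path; the field values below are invented for illustration and are not part of the patch:

    use askama::Template;

    #[derive(Template)]
    #[template(path = "feed_entry.html")]
    struct FeedEntryTemplate {
        text: String,
        page_url: String,
        page: u8,
        images: Vec<String>,
    }

    fn render_example() -> Option<String> {
        let entry = FeedEntryTemplate {
            // hypothetical footprint data; any markup in `text` is escaped, not interpreted
            text: "A <b>rainy</b> day".to_string(),
            page_url: "https://example.com/trip?page=2".to_string(),
            page: 2,
            images: vec!["https://example.com/photo.jpg".to_string()],
        };
        // render() returns the filled-in HTML; into_rss_item() calls .ok() on it, so a
        // template error simply yields an RSS item without content instead of a panic
        entry.render().ok()
    }
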
From 92f7a489fb2694c59fba8f1cd94ccd65c2d06608 Mon Sep 17 00:00:00 2001
From: Sebastian Hugentobler
Date: Fri, 15 Dec 2023 07:57:54 +0100
Subject: [PATCH 02/10] use gitea actions

---
 .gitea/workflows/container.yaml | 12 ++++++++++++
 .woodpecker.yml                 | 11 -----------
 Containerfile                   |  9 +++------
 3 files changed, 15 insertions(+), 17 deletions(-)
 create mode 100644 .gitea/workflows/container.yaml
 delete mode 100644 .woodpecker.yml

diff --git a/.gitea/workflows/container.yaml b/.gitea/workflows/container.yaml
new file mode 100644
index 0000000..e48b3fd
--- /dev/null
+++ b/.gitea/workflows/container.yaml
@@ -0,0 +1,12 @@
+name: Build Multiarch Container Image
+on: [push]
+jobs:
+  call-reusable-workflow:
+    uses: container/multiarch-build-workflow/.gitea/workflows/build.yaml@main
+    with:
+      repository: ${{ gitea.repository }}
+      ref_name: ${{ gitea.ref_name }}
+      sha: ${{ gitea.sha }}
+      registry_url: ${{ secrets.REGISTRY_URL }}
+      registry_user: ${{ secrets.REGISTRY_USER }}
+      registry_pw: ${{ secrets.REGISTRY_PW }}
diff --git a/.woodpecker.yml b/.woodpecker.yml
deleted file mode 100644
index efb62a7..0000000
--- a/.woodpecker.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-pipeline:
-  publish-docker-image:
-    image: plugins/kaniko
-    settings:
-      repo: docker.io/thallian/fp-feed
-      tags: latest,${CI_COMMIT_SHA:0:8},${CI_COMMIT_TAG=pre}
-      dockerfile: Containerfile
-      username:
-        from_secret: DOCKER_USER
-      password:
-        from_secret: DOCKER_PW
diff --git a/Containerfile b/Containerfile
index aa08e08..e6186da 100644
--- a/Containerfile
+++ b/Containerfile
@@ -1,24 +1,21 @@
 FROM docker.io/rust:1-alpine3.18 AS builder
 
-ARG ARCH=x86_64
-
 RUN apk --no-cache add musl-dev
 
 ENV CARGO_CARGO_NEW_VCS="none"
 ENV CARGO_BUILD_RUSTFLAGS="-C target-feature=+crt-static"
-ENV CARGO_BUILD_TARGET="$ARCH-unknown-linux-musl"
 
 WORKDIR /work
 
 RUN cargo init
 COPY Cargo.toml Cargo.lock ./
-RUN cargo build --release
+RUN cargo build --release --target=$(arch)-unknown-linux-musl
 
 COPY . .
 
 # ensure rebuilding of the app
 RUN touch src/main.rs
-RUN cargo build --release
-RUN cp "./target/$CARGO_BUILD_TARGET/release/findpenguins-feed" /app
+RUN cargo build --release --target=$(arch)-unknown-linux-musl
+RUN cp "./target/$(arch)-unknown-linux-musl/release/findpenguins-feed" /app
 
 FROM scratch
From 951cd9042f7eff3f6621050e3877179eeb478989 Mon Sep 17 00:00:00 2001
From: Sebastian Hugentobler
Date: Sun, 7 Jan 2024 14:35:31 +0100
Subject: [PATCH 03/10] correctly recognize videos

---
 Cargo.lock                |  2 +-
 Cargo.toml                |  2 +-
 Containerfile             |  2 +-
 src/feeds/mod.rs          |  6 +++---
 src/feeds/template.rs     |  4 +++-
 src/scrapers.rs           | 55 ++++++++++++++++++++++++++++-----------
 templates/feed_entry.html | 11 +++++---
 7 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 578e573..93fbc79 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -516,7 +516,7 @@ dependencies = [
 
 [[package]]
 name = "findpenguins-feed"
-version = "0.3.0"
+version = "0.4.0"
 dependencies = [
  "askama",
  "axum",
diff --git a/Cargo.toml b/Cargo.toml
index e2b08a6..3eb0596 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "findpenguins-feed"
-version = "0.3.0"
+version = "0.4.0"
 edition = "2021"
 authors = ["Sebastian Hugentobler "]
 license = "AGPL-3.0-or-later"
diff --git a/Containerfile b/Containerfile
index e6186da..4647635 100644
--- a/Containerfile
+++ b/Containerfile
@@ -1,4 +1,4 @@
-FROM docker.io/rust:1-alpine3.18 AS builder
+FROM docker.io/rust:1-alpine3.19 AS builder
 
 RUN apk --no-cache add musl-dev
 
diff --git a/src/feeds/mod.rs b/src/feeds/mod.rs
index bc30452..1c6a0e7 100644
--- a/src/feeds/mod.rs
+++ b/src/feeds/mod.rs
@@ -9,7 +9,7 @@ use time::format_description::well_known::iso8601::FormattedComponents;
 use time::format_description::well_known::{iso8601, Iso8601};
 use time::Date;
 
-use crate::scrapers::page_url;
+use crate::scrapers::{page_url, Media};
 use crate::{hash, scrapers};
 
 use self::template::FeedEntryTemplate;
@@ -37,7 +37,7 @@ pub struct Footprint {
     pub url: String,
     pub date: Date,
     pub page: u8,
-    pub images: Vec<String>,
+    pub media: Vec<Media>,
 }
 
 const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
@@ -58,7 +58,7 @@ impl Footprint {
             text: text.to_string(),
             page_url,
             page: self.page,
-            images: self.images,
+            media: self.media,
         };
         let content = template.render().ok();
 
diff --git a/src/feeds/template.rs b/src/feeds/template.rs
index 8a61300..4848edf 100644
--- a/src/feeds/template.rs
+++ b/src/feeds/template.rs
@@ -1,5 +1,7 @@
 use askama::Template;
 
+use crate::scrapers::Media;
+
 use super::Feed;
 
 #[derive(Template)]
@@ -14,5 +16,5 @@ pub struct FeedEntryTemplate {
     pub text: String,
     pub page_url: String,
     pub page: u8,
-    pub images: Vec<String>,
+    pub media: Vec<Media>,
 }
diff --git a/src/scrapers.rs b/src/scrapers.rs
index 32b6a02..91f5fbe 100644
--- a/src/scrapers.rs
+++ b/src/scrapers.rs
@@ -1,5 +1,6 @@
 use crate::feeds::Footprint;
-use scraper::{ElementRef, Html, Selector};
+use scraper::{selector::CssLocalName, CaseSensitivity, Element, ElementRef, Html, Selector};
+use serde::Deserialize;
 use thiserror::Error;
 use time::{format_description, Date};
 
@@ -11,6 +12,12 @@ pub enum ScrapeError {
     SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
 }
 
+#[derive(Clone, Debug, Deserialize)]
+pub enum Media {
+    Image(String),
+    Video(String),
+}
+
 pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
     let resp = reqwest::get(feed_url).await?.text().await?;
     let doc = Html::parse_document(&resp);
@@ -80,7 +87,7 @@ fn parse_footprint(
         .trim()
         .to_string();
 
-    let images = scrape_img_links(&footprint_el).ok()?;
+    let media = scrape_media_links(&footprint_el).ok()?;
 
     Some(Footprint {
         title,
@@ -88,7 +95,7 @@ fn parse_footprint(
         date,
         text,
         page,
-        images,
+        media,
     })
 }
 
@@ -107,26 +114,44 @@ fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
     Some(el.value().attr(attribute)?.to_string())
 }
 
-fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> {
-    let main_img_selector = Selector::parse("div.images-container > a.image")?;
-    let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?;
+fn scrape_media_links(footprint_el: &ElementRef) -> Result<Vec<Media>, ScrapeError> {
+    let main_media_selector = Selector::parse("div.images-container > a.image")?;
+    let other_media_selector =
+        Selector::parse("div.images-container > div.thumbs > a.image.thumb.image")?;
 
-    let mut img_urls = Vec::new();
+    let mut media = Vec::new();
 
-    if let Some(main_img) = footprint_el.select(&main_img_selector).next() {
-        if let Some(main_img_url) = attr_from_el(&main_img, "data-url") {
-            img_urls.push(main_img_url);
+    if let Some(main_media) = footprint_el.select(&main_media_selector).next() {
+        if let Some(main_media) = concrete_media(&main_media) {
+            media.push(main_media);
         }
     }
 
-    for img in footprint_el.select(&other_img_selector) {
-        if let Some(main_img_url) = attr_from_el(&img, "data-url") {
-            img_urls.push(main_img_url);
+    for other_media in footprint_el.select(&other_media_selector) {
+        if let Some(other_media) = concrete_media(&other_media) {
+            media.push(other_media);
        }
     }
 
-    let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect();
-    Ok(img_urls)
+    Ok(media)
+}
+
+fn concrete_media(media: &ElementRef) -> Option<Media> {
+    if let Some(media_url) = attr_from_el(media, "data-url") {
+        let media_url = format!("https:{}", media_url);
+        Some(
+            if media.has_class(
+                &CssLocalName("photo".into()),
+                CaseSensitivity::AsciiCaseInsensitive,
+            ) {
+                Media::Image(media_url)
+            } else {
+                Media::Video(media_url)
+            },
+        )
+    } else {
+        None
+    }
 }
 
 pub fn page_url(feed_url: &str, page: u8) -> String {
diff --git a/templates/feed_entry.html b/templates/feed_entry.html
index c9f8000..305ce35 100644
--- a/templates/feed_entry.html
+++ b/templates/feed_entry.html
@@ -1,7 +1,10 @@
 {{ text }}<br/><br/>
 ---<br/><br/>
-from page <a href="{{ page_url }}">{{ page }}</a>
+from page <a href="{{ page_url }}">{{ page }}</a><br/>
 
-{% for img in images %}
-<img src="{{ img }}"/><br/>
-{% endfor %}
+{% for media_entry in media %} {% match media_entry %} {% when Media::Image with
+(url) %}
+<img src="{{ url }}"/><br/>
+{% when Media::Video with (url) %}
+
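
The crux of this patch: footprint media links are no longer all treated as images. concrete_media() still reads the data-url attribute, but classifies the element by its CSS class, "photo" means image and anything else is assumed to be a video, and the template then branches on the Media enum with {% match %}. A small self-contained sketch of that classification idea, using the same scraper calls the patch itself uses; the classify() helper and the markup snippets are invented for illustration and are not part of the patch:

    use scraper::{selector::CssLocalName, CaseSensitivity, Element, Html, Selector};

    // Mirror of the decision inside concrete_media(): an `a.image` element with the
    // "photo" class is an image, anything else is treated as a video.
    fn classify(fragment: &str) -> Option<&'static str> {
        let doc = Html::parse_fragment(fragment);
        let selector = Selector::parse("a.image").ok()?;
        let el = doc.select(&selector).next()?;
        let is_photo = el.has_class(
            &CssLocalName("photo".into()),
            CaseSensitivity::AsciiCaseInsensitive,
        );
        Some(if is_photo { "image" } else { "video" })
    }

    fn main() {
        // hypothetical snippets shaped like the findpenguins image container
        assert_eq!(
            classify(r#"<a class="image photo" data-url="//img.example/1.jpg"></a>"#),
            Some("image")
        );
        assert_eq!(
            classify(r#"<a class="image thumb" data-url="//vid.example/1.mp4"></a>"#),
            Some("video")
        );
    }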