Compare commits

...

6 Commits

7 changed files with 69 additions and 23 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
/target /target
.envrc

2
Cargo.lock generated
View File

@ -516,7 +516,7 @@ dependencies = [
[[package]] [[package]]
name = "findpenguins-feed" name = "findpenguins-feed"
version = "0.1.1" version = "0.2.0"
dependencies = [ dependencies = [
"askama", "askama",
"axum", "axum",

View File

@ -1,6 +1,6 @@
[package] [package]
name = "findpenguins-feed" name = "findpenguins-feed"
version = "0.1.1" version = "0.2.0"
edition = "2021" edition = "2021"
authors = ["Sebastian Hugentobler <shu@vanwa.ch>"] authors = ["Sebastian Hugentobler <shu@vanwa.ch>"]
license = "AGPL-3.0-or-later" license = "AGPL-3.0-or-later"

View File

@ -21,7 +21,7 @@ executable.
# Configuration # Configuration
Configuration is handled with a confoguration file and/or environment variables. Configuration is handled with a configuration file and/or environment variables.
The configuration file is sourced from the following locations, depending on the The configuration file is sourced from the following locations, depending on the
operating system: operating system:
@ -30,6 +30,8 @@ operating system:
- Windows: `{FOLDERID_RoamingAppData}/findpenguins-feed/config/config.toml` - Windows: `{FOLDERID_RoamingAppData}/findpenguins-feed/config/config.toml`
- macOS: `$HOME/Library/Preferences/ch.vanwa.findpenguins-feed/config.toml` - macOS: `$HOME/Library/Preferences/ch.vanwa.findpenguins-feed/config.toml`
Environment variables always take precedence.
## server_address ## server_address
IP and port the server should listen on. IP and port the server should listen on.

View File

@ -34,6 +34,7 @@ pub struct Footprint {
pub url: String, pub url: String,
pub date: Date, pub date: Date,
pub page: u8, pub page: u8,
pub images: Vec<String>,
} }
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
@ -54,11 +55,23 @@ impl Footprint {
page_url(root_url, self.page), page_url(root_url, self.page),
self.page self.page
); );
// FIXME: injection risk again: image URLs are interpolated into the HTML below without escaping
let content = self
.images
.iter()
.map(|x| format!("<img src=\"{}\" /><br />", x))
.fold(String::new(), |mut a, b| {
a.reserve(b.len());
a.push_str(&b);
a
});
ItemBuilder::default() ItemBuilder::default()
.title(Some(self.title)) .title(Some(self.title))
.pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok()) .pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok())
.link(Some(self.url.clone())) .link(Some(self.url.clone()))
.description(Some(desc)) .description(Some(desc))
.content(Some(content))
.guid(Some(GuidBuilder::default().value(self.url).build())) .guid(Some(GuidBuilder::default().value(self.url).build()))
.build() .build()
} }

View File

@ -10,5 +10,5 @@ pub(crate) fn setup(bin_name: &str) {
) )
.with(tracing_subscriber::fmt::layer().with_target(true)) .with(tracing_subscriber::fmt::layer().with_target(true))
.init(); .init();
debug!("tracing/logging is setup"); debug!("tracing/logging initialized");
} }

View File

@ -28,8 +28,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeEr
let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?; let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?; let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
let more_selector = Selector::parse("a#footprintListLoadMore")?; let more_selector = Selector::parse("a#footprintListLoadMore")?;
let text_selector = Selector::parse("div.text > p")?;
let text_rest_selector = Selector::parse("div.text > p > span.rest")?;
let mut footprints: Vec<Footprint> = Vec::new(); let mut footprints: Vec<Footprint> = Vec::new();
let mut has_more_pages = true; let mut has_more_pages = true;
@ -44,8 +42,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeEr
&x, &x,
&footprint_title_selector, &footprint_title_selector,
&footprint_date_selector, &footprint_date_selector,
&text_selector,
&text_rest_selector,
page, page,
) )
})); }));
@ -61,26 +57,20 @@ fn parse_footprint(
footprint_el: &ElementRef, footprint_el: &ElementRef,
footprint_title_selector: &Selector, footprint_title_selector: &Selector,
footprint_date_selector: &Selector, footprint_date_selector: &Selector,
text_selector: &Selector,
text_rest_selector: &Selector,
page: u8, page: u8,
) -> Option<Footprint> { ) -> Option<Footprint> {
let title_el = footprint_el.select(footprint_title_selector).next()?; let title_el = footprint_el.select(footprint_title_selector).next()?;
let title = title_el.text().next()?.to_string(); let title = single_text_from_el(&title_el)?;
let url = title_el.value().attr("href")?.to_string(); let url = attr_from_el(&title_el, "href")?;
let date = footprint_el let date_el = footprint_el.select(footprint_date_selector).next()?;
.select(footprint_date_selector) let date = attr_from_el(&date_el, "content")?;
.next()?
.value()
.attr("content")?;
let format = format_description::parse("[year]-[month]-[day]").ok()?; let format = format_description::parse("[year]-[month]-[day]").ok()?;
let date = Date::parse(date, &format).ok()?; let date = Date::parse(&date, &format).ok()?;
let text = if let Some(text) = footprint_el.select(text_selector).next() { let text = if let Some(text) = single_text_from_selector(footprint_el, "div.text > p") {
let text = text.text().next()?.to_string(); if let Some(text_rest) = single_text_from_selector(footprint_el, "div.text > p > span.rest")
{
if let Some(text_rest) = footprint_el.select(text_rest_selector).next() { format!("{}{}", text, text_rest)
format!("{}{}", text, text_rest.text().next()?)
} else { } else {
text text
} }
@ -90,15 +80,55 @@ fn parse_footprint(
.trim() .trim()
.to_string(); .to_string();
let images = scrape_img_links(&footprint_el).ok()?;
Some(Footprint { Some(Footprint {
title, title,
url, url,
date, date,
text, text,
page, page,
images,
}) })
} }
fn single_text_from_selector(el: &ElementRef, selector: &str) -> Option<String> {
let selector = Selector::parse(selector).ok()?;
let selected = el.select(&selector).next()?;
single_text_from_el(&selected)
}
/// Returns the first item produced by `el.text()` as an owned `String`, or
/// `None` when the iterator is empty.
fn single_text_from_el(el: &ElementRef) -> Option<String> {
    let first_chunk = el.text().next()?;
    Some(first_chunk.to_string())
}
/// Returns an owned copy of the value of `attribute` on `el`, or `None` when
/// the element does not carry that attribute.
fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
    el.value().attr(attribute).map(str::to_string)
}
/// Collects the image URLs of a footprint element, main image first.
///
/// Reads the `data-url` attribute of the first
/// `div.images-container > a.image` match, followed by every
/// `div.images-container > div.thumbs > a.image.thumb` match. Elements without
/// a `data-url` attribute are skipped.
///
/// Protocol-relative values (starting with `//`) get an `https:` scheme
/// prepended. Any other value is passed through unchanged, so an already
/// absolute URL is not mangled into `https:https://…`.
///
/// # Errors
///
/// Propagates the error of `Selector::parse` (converted by `?`) if one of the
/// selectors cannot be parsed.
fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> {
    let main_img_selector = Selector::parse("div.images-container > a.image")?;
    let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?;

    // Only the first main-image match is considered; thumbnails follow in
    // document order.
    let main_img = footprint_el.select(&main_img_selector).next();
    let img_urls = main_img
        .into_iter()
        .chain(footprint_el.select(&other_img_selector))
        .filter_map(|img| attr_from_el(&img, "data-url"))
        .map(|url| {
            if url.starts_with("//") {
                format!("https:{}", url)
            } else {
                url
            }
        })
        .collect();

    Ok(img_urls)
}
pub fn page_url(feed_url: &str, page: u8) -> String { pub fn page_url(feed_url: &str, page: u8) -> String {
let connector = if feed_url.contains('?') { "&" } else { "?" }; let connector = if feed_url.contains('?') { "&" } else { "?" };
format!("{}{}page={}&sort=ASC", feed_url, connector, page) format!("{}{}page={}&sort=ASC", feed_url, connector, page)