initial commit

This commit is contained in:
Sebastian Hugentobler 2023-11-15 16:26:35 +01:00
commit 2a9f427bc7
Signed by: shu
GPG key ID: BB32CF3CA052C2F0
21 changed files with 3692 additions and 0 deletions

21
src/app.rs Normal file
View file

@ -0,0 +1,21 @@
use std::collections::HashMap;
use std::sync::Arc;
use axum::Router;
use tower_http::trace::TraceLayer;
use crate::{feeds::Feed, routes};
/// Shared, read-only application state handed to the request handlers.
#[derive(Debug)]
pub struct AppState {
/// All known feeds; the handler for `/feeds/:id` looks a feed up in this map by the
/// `id` path segment.
pub feeds: HashMap<String, Feed>,
}
/// Build the application router.
///
/// Wraps `feeds` in a reference-counted [`AppState`], mounts everything returned by
/// `routes::all()` under `/` and adds an HTTP tracing layer on top.
pub fn create(feeds: HashMap<String, Feed>) -> Router {
    let shared_state = Arc::new(AppState { feeds });
    let http_tracing = TraceLayer::new_for_http();

    Router::new()
        .nest("/", routes::all())
        .with_state(shared_state)
        .layer(http_tracing)
}

79
src/config.rs Normal file
View file

@ -0,0 +1,79 @@
use std::path::PathBuf;
use config::{Config, ConfigError, Environment};
use directories::ProjectDirs;
use serde::Deserialize;
use thiserror::Error;
use tracing::debug;
/// Errors that can occur while locating or reading the application configuration.
#[derive(Error, Debug)]
pub enum AppConfigError {
/// The project directories could not be determined (no valid home directory).
#[error("No valid home directory found.")]
HomePathError,
/// Building or deserializing the configuration failed; wraps the underlying `ConfigError`.
#[error("Configuration error.")]
ConfigError(#[from] ConfigError),
}
/// Address the server listens on when the configuration does not specify one
/// (IPv6 loopback, port 3000).
fn default_server_address() -> String {
    "[::1]:3000".to_owned()
}
/// Feed list used when the configuration does not specify one: no feeds at all.
fn default_feeds() -> Vec<String> {
    Vec::default()
}
/// Application configuration as deserialized from the config file and environment.
///
/// NOTE(review): the derived `Default` yields an empty `server_address`, whereas a missing
/// key during deserialization yields `default_server_address()` — confirm whether
/// `PenguinConfig::default()` is used anywhere and whether that difference is intended.
#[derive(Clone, Debug, Default, Deserialize)]
pub struct PenguinConfig {
/// Socket address the HTTP server binds to; falls back to `default_server_address()`
/// when absent from the configuration sources.
#[serde(default = "default_server_address")]
pub server_address: String,
/// Feed URLs to serve; falls back to `default_feeds()` (an empty list) when absent.
#[serde(default = "default_feeds")]
pub feeds: Vec<String>,
}
impl PenguinConfig {
    /// Find the project directories for the respective operating system.
    ///
    /// Uses the qualifier `ch`, the organization `vanwa` and the application name
    /// `findpenguins-feed`.
    ///
    /// # Errors
    ///
    /// Returns [`AppConfigError::HomePathError`] if no project directories could be
    /// determined.
    fn get_project_dirs() -> Result<ProjectDirs, AppConfigError> {
        ProjectDirs::from("ch", "vanwa", "findpenguins-feed").ok_or(AppConfigError::HomePathError)
    }

    /// Find the configuration file for the respective operating system.
    ///
    /// The file is always called `config.toml` and lives in the per-project configuration
    /// directory reported by `ProjectDirs::config_dir`. The exact location is platform
    /// dependent (on Linux typically `$XDG_CONFIG_HOME/findpenguins-feed/config.toml` or
    /// `$HOME/.config/findpenguins-feed/config.toml`; see the `directories` crate
    /// documentation for macOS and Windows).
    ///
    /// # Errors
    ///
    /// Returns [`AppConfigError::HomePathError`] if the project directories are unknown.
    fn config_path() -> Result<PathBuf, AppConfigError> {
        let project_dirs = Self::get_project_dirs()?;
        Ok(project_dirs.config_dir().join("config.toml"))
    }

    /// Read the application configuration.
    ///
    /// Sources are merged in this order, later ones overriding earlier ones:
    ///
    /// 1. the optional config file at `config_path()` (skipped when the path cannot be
    ///    determined, ignored when the file does not exist),
    /// 2. environment variables with the prefix `findpenguins_feed`.
    ///
    /// Environment variables use `__` (double underscore) as the separator between
    /// hierarchies, so that configuration keys may contain single underscores. The `feeds`
    /// key is parsed as a `,` separated list, for example
    /// `FINDPENGUINS_FEED__FEEDS="https://example.org/a,https://example.org/b"`.
    /// NOTE(review): the exact variable name (prefix separator, casing) follows the rules
    /// of the `config` crate's `Environment` source — verify against the version in use.
    ///
    /// # Errors
    ///
    /// Returns [`AppConfigError::ConfigError`] if building or deserializing the merged
    /// configuration fails.
    pub fn read() -> Result<Self, AppConfigError> {
        let file_layer = match Self::config_path() {
            Ok(path) => {
                debug!("trying config file at {:#?}...", path);
                Config::builder().add_source(config::File::from(path).required(false))
            }
            // Without a resolvable config directory only the environment is consulted.
            Err(_) => Config::builder(),
        };

        debug!("merging settings from environment variables...");
        let env_source = Environment::with_prefix("findpenguins_feed")
            .separator("__")
            .list_separator(",")
            .with_list_parse_key("feeds")
            .try_parsing(true);

        let settings = file_layer.add_source(env_source).build()?;
        Ok(settings.try_deserialize()?)
    }
}

74
src/feeds/mod.rs Normal file
View file

@ -0,0 +1,74 @@
use std::collections::HashMap;
use std::fmt;
use futures::future::join_all;
use rss::{GuidBuilder, Item, ItemBuilder};
use serde::Deserialize;
use time::format_description::well_known::iso8601::FormattedComponents;
use time::format_description::well_known::{iso8601, Iso8601};
use time::Date;
use crate::scrapers::page_url;
use crate::{hash, scrapers};
pub mod route;
pub mod template;
/// A single feed that can be listed and served as RSS.
#[derive(Clone, Debug, Deserialize)]
pub struct Feed {
/// Human readable title of the feed.
pub title: String,
/// URL the feed content is scraped from.
pub url: String,
/// Identifier of the feed; `enrich_feeds` derives it from the URL via `hash::fnv_str`.
pub id: String,
}
impl fmt::Display for Feed {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{} ({})", self.title, self.url)
}
}
/// A single scraped entry ("footprint") of a feed.
#[derive(Clone, Debug, Deserialize)]
pub struct Footprint {
/// Headline of the footprint.
pub title: String,
/// Text body of the footprint.
pub text: String,
/// Link to the footprint; also used as the RSS item's link and GUID.
pub url: String,
/// Date of the footprint (no time component).
pub date: Date,
/// Number of the listing page the footprint was scraped from.
pub page: u8,
}
/// Encoded ISO 8601 formatting configuration, based on the default configuration but
/// restricted to the date component (used as the const parameter of `Iso8601`).
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
.set_formatted_components(FormattedComponents::Date)
.encode();
impl Footprint {
pub fn into_rss_item(self, root_url: &str) -> Item {
let desc = format!(
"{} <br /><br /> --- <br /><br /> from {}",
self.text,
page_url(root_url, self.page)
);
ItemBuilder::default()
.title(Some(self.title))
.pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok())
.link(Some(self.url.clone()))
.description(Some(desc))
.guid(Some(GuidBuilder::default().value(self.url).build()))
.build()
}
}
pub async fn enrich_feeds(feed_urls: Vec<String>) -> HashMap<String, Feed> {
let feeds = join_all(feed_urls.iter().map(|feed_url| async {
let title = scrapers::feed_title(feed_url).await.unwrap();
let id = hash::fnv_str(feed_url);
Feed {
title,
url: feed_url.clone(),
id,
}
}))
.await;
feeds.iter().map(|x| (x.id.clone(), x.clone())).collect()
}

46
src/feeds/route.rs Normal file
View file

@ -0,0 +1,46 @@
use std::sync::Arc;
use axum::extract::Path;
use axum::http::StatusCode;
use axum::{extract::State, response::IntoResponse};
use rss::{ChannelBuilder, Item};
use crate::scrapers::fetch_footprints;
use crate::{app::AppState, templates::HtmlTemplate};
use super::template::FeedsTemplate;
/// Handler rendering the HTML overview of all configured feeds.
pub async fn feeds(State(state): State<Arc<AppState>>) -> impl IntoResponse {
    let feeds: Vec<_> = state.feeds.values().cloned().collect();
    HtmlTemplate(FeedsTemplate { feeds })
}
pub async fn feed(
State(state): State<Arc<AppState>>,
Path(feed_id): Path<String>,
) -> Result<impl IntoResponse, StatusCode> {
let feed = state.feeds.get(&feed_id).ok_or(StatusCode::NOT_FOUND)?;
let footprints = fetch_footprints(&feed.url)
.await
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
let footprints: Vec<Item> = footprints
.iter()
.map(|x| x.clone().into_rss_item(&feed.url))
.collect();
let channel = ChannelBuilder::default()
.title(&feed.title)
.link(&feed.url)
.description(format!("FindPenguins: {}", &feed.title))
.items(footprints)
.build();
Ok((
StatusCode::OK,
[("Content-Type", "application/rss+xml")],
channel.to_string(),
))
}

9
src/feeds/template.rs Normal file
View file

@ -0,0 +1,9 @@
use askama::Template;
use super::Feed;
/// Template data for the feed overview page, rendered with the `feeds.html` template.
#[derive(Template)]
#[template(path = "feeds.html")]
pub struct FeedsTemplate {
/// The feeds made available to the template.
pub feeds: Vec<Feed>,
}

12
src/hash.rs Normal file
View file

@ -0,0 +1,12 @@
use std::hash::Hasher;
use fnv::FnvHasher;
/// Hash `input` with the default `FnvHasher` and return the 64-bit result as a lowercase
/// hexadecimal string (without leading zeros).
pub fn fnv_str(input: &str) -> String {
    let mut state = FnvHasher::default();
    state.write(input.as_bytes());
    format!("{:x}", state.finish())
}

14
src/logging.rs Normal file
View file

@ -0,0 +1,14 @@
use tracing::debug;
use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
/// Install the global tracing subscriber.
///
/// The filter is taken from the default environment variable if it can be parsed;
/// otherwise a built-in filter is used that enables debug output for `bin_name` and
/// `tower_http` as well as trace output for `axum::rejection`.
pub(crate) fn setup(bin_name: &str) {
    let filter: tracing_subscriber::EnvFilter =
        match tracing_subscriber::EnvFilter::try_from_default_env() {
            Ok(from_env) => from_env,
            Err(_) => {
                format!("{}=debug,tower_http=debug,axum::rejection=trace", bin_name).into()
            }
        };
    let fmt_layer = tracing_subscriber::fmt::layer().with_target(true);

    tracing_subscriber::registry()
        .with(filter)
        .with(fmt_layer)
        .init();

    debug!("tracing/logging is setup");
}

45
src/main.rs Normal file
View file

@ -0,0 +1,45 @@
use std::net::SocketAddr;
use crate::{config::PenguinConfig, feeds::enrich_feeds};
mod app;
mod config;
mod feeds;
mod hash;
mod logging;
mod routes;
mod scrapers;
mod signals;
mod templates;
/// Application entry point: set up logging, read the configuration, resolve the feeds
/// and serve the application until a shutdown signal arrives.
#[tokio::main]
async fn main() {
    logging::setup("findpenguins_feed");

    let config = PenguinConfig::read().expect("Failed to read application configuration");
    let addr = get_addr(&config);
    let app = app::create(enrich_feeds(config.feeds).await);

    tracing::debug!("Listening on {}", addr);
    let server = axum::Server::bind(&addr)
        .serve(app.into_make_service())
        .with_graceful_shutdown(signals::shutdown());

    server.await.expect("Failed to start server");
}
/// Retrieve the address to listen on.
///
/// # Returns
///
/// The `SocketAddr` parsed from `config.server_address`.
///
/// # Panics
///
/// If `config.server_address` cannot be parsed into a `SocketAddr`. The panic message
/// contains the offending value and the parse error.
fn get_addr(config: &PenguinConfig) -> SocketAddr {
    config
        .server_address
        .parse::<SocketAddr>()
        .unwrap_or_else(|err| {
            // `expect` takes a plain string, so a `{}` placeholder in it is never filled in;
            // build the message explicitly instead.
            panic!(
                "No proper address to listen on: {} ({})",
                config.server_address, err
            )
        })
}

11
src/routes.rs Normal file
View file

@ -0,0 +1,11 @@
use std::sync::Arc;
use axum::{body::Body, routing::get, Router};
use crate::{app::AppState, feeds};
/// Assemble every route of the application:
///
/// - `GET /` lists the feeds,
/// - `GET /feeds/:id` serves a single feed.
pub fn all() -> Router<Arc<AppState>, Body> {
    let feed_list = get(feeds::route::feeds);
    let single_feed = get(feeds::route::feed);

    Router::new()
        .route("/", feed_list)
        .route("/feeds/:id", single_feed)
}

105
src/scrapers.rs Normal file
View file

@ -0,0 +1,105 @@
use crate::feeds::Footprint;
use scraper::{ElementRef, Html, Selector};
use thiserror::Error;
use time::{format_description, Date};
/// Errors that can occur while scraping a feed.
#[derive(Error, Debug)]
pub enum ScrapeError {
/// Fetching a document or reading its body failed; wraps the `reqwest` error.
#[error("An error occurred fetching a document.")]
FetchError(#[from] reqwest::Error),
/// A CSS selector could not be parsed; wraps the `scraper` error.
#[error("An error occurred constructing a selector.")]
SelectorError(#[from] scraper::error::SelectorErrorKind<'static>),
}
/// Scrape the title of the feed at `feed_url`.
///
/// The title is the first text node found in an element matching `span.placeholder`;
/// if there is none, `no title` is returned.
///
/// # Errors
///
/// Returns a [`ScrapeError`] if the document cannot be fetched or the selector cannot be
/// constructed.
pub async fn feed_title(feed_url: &str) -> Result<String, ScrapeError> {
    let body = reqwest::get(feed_url).await?.text().await?;

    let document = Html::parse_document(&body);
    let placeholder = Selector::parse("span.placeholder")?;

    let mut candidates = document.select(&placeholder);
    let title = match candidates.find_map(|el| el.text().next()) {
        Some(found) => found,
        None => "no title",
    };

    Ok(title.to_owned())
}
pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeError> {
let footprint_selector = Selector::parse("li.footprint div.footprint-container")?;
let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
let more_selector = Selector::parse("a#footprintListLoadMore")?;
let text_selector = Selector::parse("div.text > p")?;
let text_rest_selector = Selector::parse("div.text > p > span.rest")?;
let mut footprints: Vec<Footprint> = Vec::new();
let mut has_more_pages = true;
let mut page = 1u8;
while has_more_pages {
let feed_url = page_url(feed_url, page);
let resp = reqwest::get(feed_url).await?.text().await?;
let doc = Html::parse_document(&resp);
footprints.extend(doc.select(&footprint_selector).flat_map(|x| {
parse_footprint(
&x,
&footprint_title_selector,
&footprint_date_selector,
&text_selector,
&text_rest_selector,
page,
)
}));
has_more_pages = doc.select(&more_selector).next().is_some();
page += 1;
}
Ok(footprints)
}
/// Extract a [`Footprint`] from a single footprint container element.
///
/// - title and URL come from the first element matching `footprint_title_selector`
///   (its first text node and its `href` attribute),
/// - the date is the `content` attribute of the first element matching
///   `footprint_date_selector`, parsed as `[year]-[month]-[day]`,
/// - the text is the first text node of the first element matching `text_selector`, with
///   the first text node of the first `text_rest_selector` match appended if there is
///   such an element; the result is trimmed.
///
/// `page` is stored on the footprint unchanged.
///
/// Returns `None` as soon as any of the required pieces is missing or unparsable.
fn parse_footprint(
    footprint_el: &ElementRef,
    footprint_title_selector: &Selector,
    footprint_date_selector: &Selector,
    text_selector: &Selector,
    text_rest_selector: &Selector,
    page: u8,
) -> Option<Footprint> {
    let headline = footprint_el.select(footprint_title_selector).next()?;
    let title = headline.text().next()?.to_owned();
    let url = headline.value().attr("href")?.to_owned();

    let date_el = footprint_el.select(footprint_date_selector).next()?;
    let raw_date = date_el.value().attr("content")?;
    let date_format = format_description::parse("[year]-[month]-[day]").ok()?;
    let date = Date::parse(raw_date, &date_format).ok()?;

    let text_el = footprint_el.select(text_selector).next()?;
    let mut body = text_el.text().next()?.to_owned();
    if let Some(rest_el) = footprint_el.select(text_rest_selector).next() {
        body.push_str(rest_el.text().next()?);
    }
    let text = body.trim().to_owned();

    Some(Footprint {
        title,
        text,
        url,
        date,
        page,
    })
}
/// Build the URL of listing page `page` of the feed at `feed_url`.
///
/// Appends `page=<page>&sort=ASC` to the URL, joined with `&` if the URL already contains
/// a `?` and with `?` otherwise.
pub fn page_url(feed_url: &str, page: u8) -> String {
    let connector = match feed_url.find('?') {
        Some(_) => '&',
        None => '?',
    };
    format!("{}{}page={}&sort=ASC", feed_url, connector, page)
}

32
src/signals.rs Normal file
View file

@ -0,0 +1,32 @@
use tokio::signal;
use tracing::debug;
/// Resolve once the process receives SIGINT (Ctrl+C) or, on unix, SIGTERM.
///
/// The signals are handled explicitly so that the process behaves properly when running
/// as PID 1 (as is the case when it is the sole thing in a container).
///
/// On non-unix systems only Ctrl+C is handled; that configuration is untested and not
/// expected to be used.
pub(crate) async fn shutdown() {
    let interrupt = async {
        signal::ctrl_c()
            .await
            .expect("failed to install Ctrl+C handler");
    };

    #[cfg(unix)]
    let terminate = async {
        let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
            .expect("failed to install signal handler");
        sigterm.recv().await;
    };

    // No SIGTERM outside of unix: a future that never completes keeps the select uniform.
    #[cfg(not(unix))]
    let terminate = std::future::pending::<()>();

    tokio::select! {
        _ = interrupt => {},
        _ = terminate => {},
    }

    debug!("signal received, shutting down...");
}

23
src/templates.rs Normal file
View file

@ -0,0 +1,23 @@
use askama::Template;
use axum::{
http::StatusCode,
response::{Html, IntoResponse, Response},
};
/// Wrapper that turns a template into an HTTP response (see its `IntoResponse` impl).
pub struct HtmlTemplate<T>(pub T);
impl<T> IntoResponse for HtmlTemplate<T>
where
T: Template,
{
fn into_response(self) -> Response {
match self.0.render() {
Ok(html) => Html(html).into_response(),
Err(err) => (
StatusCode::INTERNAL_SERVER_ERROR,
format!("Failed to render template. Error: {}", err),
)
.into_response(),
}
}
}