ecload/src/loader.rs

282 lines
9.4 KiB
Rust

use std::fs;
use std::fs::DirEntry;
use std::fs::File;
use std::io::BufWriter;
use std::io::copy;
use std::path::Path;
use image::DynamicImage;
use image::GenericImageView;
use printpdf::{Image, Mm, PdfDocument};
use scraper::{Html, Selector};
use tempfile::TempDir;
use crate::error::EcloadError;
/// Root URL of the e-codices website; the thumbnail overview URL is built from it.
const BASEURL: &str = "https://www.e-codices.unifr.ch";
/// Resolution in dots per inch used to convert pixel dimensions to millimetres; the same value
/// is passed along when an image is placed on a PDF layer.
const DPI: f64 = 300.0;
/// Length of one inch in millimetres.
const INCH_AS_MM: f64 = 25.4;
/// Provides functions to download e-codices and bundle their pages into a PDF.
///
/// The struct has no fields; it only serves as a namespace for its associated functions.
pub struct EcLoader {}
// NOTE(review): `arg_enum!` is not defined in this file; presumably it is clap's macro brought in
// via `#[macro_use]` at the crate root - confirm. Variant notes below are plain `//` comments on
// purpose, because the macro may not accept attributes (doc comments) on individual variants.
arg_enum! {
#[derive(PartialEq, Debug)]
/// Defines the different sizes in which an e-codice page image can be requested.
pub enum PageFormat {
// Each variant is mapped to its lowercase name by `PageFormat::as_url_part`.
Small,
Medium,
Large,
Max,
}
}
impl PageFormat {
    /// Converts the format into the representation that the e-codices website uses.
    ///
    /// Every variant maps to a string literal, so the returned slice is `'static` and does not
    /// keep `self` borrowed.
    fn as_url_part(&self) -> &'static str {
        match self {
            PageFormat::Small => "small",
            PageFormat::Medium => "medium",
            PageFormat::Large => "large",
            PageFormat::Max => "max",
        }
    }
}
impl EcLoader {
/// Fetch all pages of an e-codice from https://www.e-codices.unifr.ch and bundle them into a PDF.
///
/// The pages are first downloaded into a freshly created temporary directory and then assembled
/// into a single PDF file inside `out_dir`.
///
/// # Arguments
///
/// * `id` - The combined id of the e-codice, consisting of the library id and the e-codice id (for example "bbb/0003").
/// * `title` - Title of the pdf, this will also be the filename.
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
/// * `format` - The size of the page to look for.
///
/// # Errors
///
/// Returns an error when the temporary directory cannot be created, or when downloading the
/// pages or building the PDF fails.
pub fn download_codice(
    id: &str,
    title: &str,
    out_dir: &Path,
    format: PageFormat,
) -> Result<(), EcloadError> {
    let scratch_dir = tempfile::tempdir()?;
    debug!("temporary download directory: {}", scratch_dir.path().display());
    // The PDF is only built once the download step has returned successfully.
    EcLoader::download_pages(id, format, &scratch_dir)?;
    EcLoader::build_pdf(title, out_dir, &scratch_dir)
}
/// List the entries of a temporary directory, sorted by their path.
///
/// Entries that cannot be read while iterating the directory are skipped.
///
/// # Arguments
///
/// * `tmp_dir` - Reference to the temporary directory.
///
/// # Errors
///
/// Returns an error when the directory itself cannot be opened for listing.
fn get_ordered_tmp_files(tmp_dir: &TempDir) -> Result<Vec<DirEntry>, EcloadError> {
    let mut entries: Vec<DirEntry> = Vec::new();
    for listed in fs::read_dir(tmp_dir.path())? {
        if let Ok(entry) = listed {
            entries.push(entry);
        }
    }
    entries.sort_by(|left, right| left.path().cmp(&right.path()));
    Ok(entries)
}
/// Convert the pixel dimensions of an image to millimetres.
///
/// The conversion assumes the resolution given by `DPI`.
///
/// # Arguments
///
/// * `image` - Image whose dimensions are converted.
///
/// # Returns
///
/// The `(width, height)` of the image in millimetres.
fn mm_from_image(image: &DynamicImage) -> (Mm, Mm) {
    let mm_per_pixel = INCH_AS_MM / DPI;
    let (px_width, px_height) = image.dimensions();
    (
        Mm(px_width as f64 * mm_per_pixel),
        Mm(px_height as f64 * mm_per_pixel),
    )
}
/// Re-encode an image as BMP and return the decoded BMP.
///
/// # Arguments
///
/// * `img_path` - Path to the image that should be converted.
///
/// # Remarks
///
/// The BMP file is written next to the original: same directory and file stem, with the
/// extension replaced by `bmp`. The returned image is read back from that BMP file.
///
/// # Errors
///
/// Returns an error when the source image cannot be opened, the BMP cannot be saved, or the
/// saved BMP cannot be opened again.
fn get_img_as_bmp(img_path: &Path) -> Result<DynamicImage, EcloadError> {
    let bmp_path = img_path.with_extension("bmp");
    image::open(img_path)?.save(&bmp_path)?;
    let reloaded = image::open(bmp_path)?;
    Ok(reloaded)
}
/// Generate a PDF from images inside a directory.
///
/// # Arguments
///
/// * `title` - Title of the pdf, this will also be the filename.
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
/// * `tmp_dir` - Reference to the temporary directory where the images for the pdf are.
fn build_pdf(title: &str, out_dir: &Path, tmp_dir: &TempDir) -> Result<(), EcloadError> {
let ordered_tmp_files = EcLoader::get_ordered_tmp_files(tmp_dir)?;
let first_file = match ordered_tmp_files.first() {
Some(file) => file,
None => return Err(EcloadError::Error("no tmp files".to_string())),
};
let first_image_data = image::open(first_file.path())?;
let (first_img_width, first_img_height) = EcLoader::mm_from_image(&first_image_data);
let (doc, page1, layer1) =
PdfDocument::new(title, first_img_width, first_img_height, "Layer 1");
let mut current_layer = doc.get_page(page1).get_layer(layer1);
let mut is_first_page = true;
let mut page_count = 1;
for entry in ordered_tmp_files {
let filename_os = entry.file_name();
let filename = match filename_os.to_str() {
Some(filename) => filename,
None => continue,
};
if !filename.ends_with(".jpg") {
continue;
}
info!("saving page {} to pdf...", page_count);
let image_bmp = match EcLoader::get_img_as_bmp(&entry.path()) {
Ok(image_data) => image_data,
Err(err) => {
error!("could not decode {}: {}", entry.path().display(), err);
continue;
}
};
let mut image_file = File::open(entry.path().with_extension("bmp")).unwrap();
let image = Image::try_from(image::bmp::BMPDecoder::new(&mut image_file)).unwrap();
let (img_width, img_height) = EcLoader::mm_from_image(&image_bmp);
debug!("dimensions: {:?} x {:?}", img_width, img_height);
if !is_first_page {
let (new_page, new_layer) = doc.add_page(img_width, img_height, "Layer 1");
current_layer = doc.get_page(new_page).get_layer(new_layer);
}
image.add_to_layer(
current_layer.clone(),
None,
None,
None,
None,
None,
Some(DPI),
);
is_first_page = false;
page_count += 1;
}
let pdf_file_path = out_dir.join(format!("{}.pdf", title));
info!("saving to to {}...", pdf_file_path.display());
let pdf_file = File::create(pdf_file_path)?;
doc.save(&mut BufWriter::new(pdf_file))?;
Ok(())
}
/// Build the URL of the thumbnail overview page of an e-codice.
///
/// # Arguments
///
/// * `id` - The combined id of the e-codice (for example "bbb/0003"); it is appended verbatim.
fn get_thumbnail_url(id: &str) -> String {
    let overview_url = format!("{}/en/thumbs/{}", BASEURL, id);
    overview_url
}
/// Download all the pages of an e-codice to a temporary directory.
///
/// The thumbnail overview of the e-codice is fetched and every link matching
/// `div.thumbnail-image > a` is treated as one page. For each page the download link of the
/// requested size is looked up and the image is stored as a zero-padded, consecutively numbered
/// `.jpg` file (`00000.jpg`, `00001.jpg`, ...). The counter only advances after a successful
/// download, so skipped pages leave no gaps in the numbering.
///
/// # Arguments
///
/// * `id` - The combined id of the e-codice, consisting of the library id and the e-codice id (for example "bbb/0003").
/// * `format` - The size of the page to look for.
/// * `tmp_dir` - Reference to a temporary directory where the downloaded files get saved.
///
/// # Errors
///
/// Returns an error when the thumbnail overview cannot be fetched (including a non-success HTTP
/// status) or read, or when a page file cannot be created or written. A page whose download
/// link cannot be found, or whose download fails or answers with a non-success HTTP status, is
/// logged and skipped.
fn download_pages(id: &str, format: PageFormat, tmp_dir: &TempDir) -> Result<(), EcloadError> {
    let thumbnail_url = EcLoader::get_thumbnail_url(id);
    // Reject error responses up front instead of parsing an error page as an empty overview.
    let mut response = reqwest::get(thumbnail_url.as_str())?.error_for_status()?;
    let html = response.text()?;
    let fragment = Html::parse_fragment(&html);
    let link_selector = Selector::parse("div.thumbnail-image > a")
        .expect("hard-coded CSS selector must be valid");
    let mut page_count = 0;
    for element in fragment.select(&link_selector) {
        let page_link = match element.value().attr("href") {
            Some(href) => href.to_string(),
            None => continue,
        };
        debug!("searching download links for {}...", page_link);
        let download_link = match EcLoader::get_download_link(&page_link, &format) {
            Ok(download_link) => download_link,
            Err(err) => {
                error!(
                    "could not find download link for {} with format {}: {}",
                    &page_link, format, err
                );
                continue;
            }
        };
        debug!("found {}", download_link);
        info!("downloading {}...", download_link);
        // Check the status before creating the file so that an HTTP error body is never
        // written to disk under a `.jpg` name.
        let mut page_download = match reqwest::get(&download_link)
            .and_then(|response| response.error_for_status())
        {
            Ok(response) => response,
            Err(err) => {
                error!("could not download {}: {}", download_link, err);
                continue;
            }
        };
        let page_file_path = tmp_dir.path().join(format!("{:05}.jpg", page_count));
        let mut dest = File::create(page_file_path)?;
        copy(&mut page_download, &mut dest)?;
        page_count += 1;
    }
    Ok(())
}
/// Finds the image url for a specified size for an e-codice page.
///
/// # Arguments
///
/// * `page_link` - The specific e-codice page url.
/// * `format` - The size of the page to look for.
fn get_download_link(page_link: &str, format: &PageFormat) -> Result<String, EcloadError> {
let mut response = reqwest::get(page_link)?;
let html = response.text()?;
let fragment = Html::parse_fragment(&html);
let a_selector = Selector::parse("ul.download-page-list > li > a").unwrap();
for element in fragment.select(&a_selector) {
let download_link = match element.value().attr("href") {
Some(href) => href.to_string(),
None => {
continue;
}
};
if download_link.ends_with(format.as_url_part()) {
return Ok(download_link);
}
}
Err(EcloadError::Error(
"could not find download link".to_string(),
))
}
}