282 lines
9.4 KiB
Rust
282 lines
9.4 KiB
Rust
use std::fs;
|
|
use std::fs::DirEntry;
|
|
use std::fs::File;
|
|
use std::io::BufWriter;
|
|
use std::io::copy;
|
|
use std::path::Path;
|
|
|
|
use image::DynamicImage;
|
|
use image::GenericImageView;
|
|
use printpdf::{Image, Mm, PdfDocument};
|
|
use scraper::{Html, Selector};
|
|
use tempfile::TempDir;
|
|
|
|
use crate::error::EcloadError;
|
|
|
|
/// Base URL of the e-codices website; the thumbnail overview URL is built from it.
const BASEURL: &str = "https://www.e-codices.unifr.ch";
|
|
/// Resolution in dots per inch, used both to convert pixel dimensions to millimetres and as
/// the DPI value handed to printpdf when a page image is placed on its layer.
const DPI: f64 = 300.0;
|
|
/// Length of one inch expressed in millimetres.
const INCH_AS_MM: f64 = 25.4;
|
|
|
|
/// Provides functions to download e-codices.
|
|
// The struct has no fields: every function in the `impl EcLoader` block below is an associated
// function without a `self` receiver, so the type only acts as a namespace.
pub struct EcLoader {}
|
|
|
|
arg_enum! {
|
|
#[derive(PartialEq, Debug)]
|
|
/// Defines the different sizes that e-codice pages can be.
|
|
pub enum PageFormat {
|
|
Small,
|
|
Medium,
|
|
Large,
|
|
Max,
|
|
}
|
|
}
|
|
|
|
impl PageFormat {
|
|
/// Converts the formats in the represantation that the e-codice website uses.
|
|
fn as_url_part(&self) -> &str {
|
|
match self {
|
|
PageFormat::Small => "small",
|
|
PageFormat::Medium => "medium",
|
|
PageFormat::Large => "large",
|
|
PageFormat::Max => "max",
|
|
}
|
|
}
|
|
}
|
|
|
|
impl EcLoader {
|
|
/// Download an e-codice from https://www.e-codices.unifr.ch.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `id` - The combined id ot the e-codice. Consisting of the library id and the e-codice id (for example "bbb/0003").
|
|
/// * `title` - Title of the pdf, this will also be the filename.
|
|
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
|
|
/// * `format` - The size of the page to look for.
|
|
pub fn download_codice(
|
|
id: &str,
|
|
title: &str,
|
|
out_dir: &Path,
|
|
format: PageFormat,
|
|
) -> Result<(), EcloadError> {
|
|
let tmp_dir = tempfile::tempdir()?;
|
|
debug!("temporary download directory: {}", tmp_dir.path().display());
|
|
|
|
EcLoader::download_pages(id, format, &tmp_dir)
|
|
.and_then(|_| EcLoader::build_pdf(title, out_dir, &tmp_dir))
|
|
}
|
|
|
|
/// Order files in a temporary directory
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `tmp_dir`- Reference to the temporary directory.
|
|
fn get_ordered_tmp_files(tmp_dir: &TempDir) -> Result<Vec<DirEntry>, EcloadError> {
|
|
let paths = fs::read_dir(tmp_dir.path())?;
|
|
let mut sorted_paths: Vec<DirEntry> = paths.filter_map(|r| r.ok()).collect();
|
|
|
|
sorted_paths.sort_by_key(|dir| dir.path());
|
|
|
|
Ok(sorted_paths)
|
|
}
|
|
|
|
/// Calculate image dimensions in mm.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `image` - Image for the calculation.
|
|
fn mm_from_image(image: &DynamicImage) -> (Mm, Mm) {
|
|
let (width, height) = image.dimensions();
|
|
let mm_width = Mm(width as f64 * (INCH_AS_MM / DPI));
|
|
let mm_height = Mm(height as f64 * (INCH_AS_MM / DPI));
|
|
|
|
(mm_width, mm_height)
|
|
}
|
|
|
|
/// Convert an image to BMP.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `img_path` - Path to the image that should be converted.
|
|
///
|
|
/// # Remarks
|
|
///
|
|
/// The BMP file gets saved to the same directory as the original, with the same name apart from
|
|
/// the extension.
|
|
fn get_img_as_bmp(img_path: &Path) -> Result<DynamicImage, EcloadError> {
|
|
let image = image::open(img_path)?;
|
|
let img_bmp_path = img_path.with_extension("bmp");
|
|
image.save(&img_bmp_path)?;
|
|
|
|
Ok(image::open(img_bmp_path)?)
|
|
}
|
|
|
|
/// Generate a PDF from images inside a directory.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `title` - Title of the pdf, this will also be the filename.
|
|
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
|
|
/// * `tmp_dir` - Reference to the temporary directory where the images for the pdf are.
|
|
fn build_pdf(title: &str, out_dir: &Path, tmp_dir: &TempDir) -> Result<(), EcloadError> {
|
|
let ordered_tmp_files = EcLoader::get_ordered_tmp_files(tmp_dir)?;
|
|
let first_file = match ordered_tmp_files.first() {
|
|
Some(file) => file,
|
|
None => return Err(EcloadError::Error("no tmp files".to_string())),
|
|
};
|
|
|
|
let first_image_data = image::open(first_file.path())?;
|
|
let (first_img_width, first_img_height) = EcLoader::mm_from_image(&first_image_data);
|
|
|
|
let (doc, page1, layer1) =
|
|
PdfDocument::new(title, first_img_width, first_img_height, "Layer 1");
|
|
let mut current_layer = doc.get_page(page1).get_layer(layer1);
|
|
|
|
let mut is_first_page = true;
|
|
let mut page_count = 1;
|
|
for entry in ordered_tmp_files {
|
|
let filename_os = entry.file_name();
|
|
let filename = match filename_os.to_str() {
|
|
Some(filename) => filename,
|
|
None => continue,
|
|
};
|
|
|
|
if !filename.ends_with(".jpg") {
|
|
continue;
|
|
}
|
|
|
|
info!("saving page {} to pdf...", page_count);
|
|
let image_bmp = match EcLoader::get_img_as_bmp(&entry.path()) {
|
|
Ok(image_data) => image_data,
|
|
Err(err) => {
|
|
error!("could not decode {}: {:?}", entry.path().display(), err);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
let mut image_file = File::open(entry.path().with_extension("bmp")).unwrap();
|
|
let image = Image::try_from(image::bmp::BMPDecoder::new(&mut image_file)).unwrap();
|
|
|
|
let (img_width, img_height) = EcLoader::mm_from_image(&image_bmp);
|
|
debug!("dimensions: {:?} x {:?}", img_width, img_height);
|
|
|
|
if !is_first_page {
|
|
let (new_page, new_layer) = doc.add_page(img_width, img_height, "Layer 1");
|
|
current_layer = doc.get_page(new_page).get_layer(new_layer);
|
|
}
|
|
|
|
image.add_to_layer(
|
|
current_layer.clone(),
|
|
None,
|
|
None,
|
|
None,
|
|
None,
|
|
None,
|
|
Some(DPI),
|
|
);
|
|
|
|
is_first_page = false;
|
|
page_count += 1;
|
|
}
|
|
|
|
let pdf_file_path = out_dir.join(format!("{}.pdf", title));
|
|
info!("saved to {}", pdf_file_path.display());
|
|
let pdf_file = File::create(pdf_file_path)?;
|
|
doc.save(&mut BufWriter::new(pdf_file))?;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn get_thumbnail_url(id: &str) -> String {
|
|
format!("{}/en/thumbs/{}", BASEURL, id)
|
|
}
|
|
|
|
/// Download all the pages of an e-codice to a temporary directory.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `id` - The combined id ot the e-codice. Consisting of the library id and the e-codice id (for example "bbb/0003").
|
|
/// * `format` - The size of the page to look for.
|
|
/// * `tmp_dir` - Reference to a temporary directory where the downloaded files get saved.
|
|
fn download_pages(id: &str, format: PageFormat, tmp_dir: &TempDir) -> Result<(), EcloadError> {
|
|
let thumbnail_url = EcLoader::get_thumbnail_url(id);
|
|
let mut response = reqwest::get(thumbnail_url.as_str())?;
|
|
let html = response.text()?;
|
|
let fragment = Html::parse_fragment(&html);
|
|
|
|
let li_selector = Selector::parse("div.thumbnail-image > a").unwrap();
|
|
|
|
let mut page_count = 0;
|
|
for element in fragment.select(&li_selector) {
|
|
let page_link = match element.value().attr("href") {
|
|
Some(href) => href.to_string(),
|
|
None => {
|
|
continue;
|
|
}
|
|
};
|
|
|
|
debug!("searching download links for {}...", page_link);
|
|
let download_link = match EcLoader::get_download_link(&page_link, &format) {
|
|
Ok(download_link) => download_link,
|
|
Err(err) => {
|
|
error!(
|
|
"could not find download link for {} with format {}: {:?}",
|
|
&page_link, format, err
|
|
);
|
|
continue;
|
|
}
|
|
};
|
|
debug!("found {}", download_link);
|
|
|
|
info!("downloading {}...", download_link);
|
|
let page_file_path = tmp_dir
|
|
.path()
|
|
.join(format!("{:0>5}.jpg", page_count.to_string()));
|
|
|
|
let mut page_download = match reqwest::get(&download_link) {
|
|
Ok(response) => response,
|
|
Err(err) => {
|
|
error!("could not download {}: {:?}", download_link, err);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
let mut dest = File::create(page_file_path)?;
|
|
copy(&mut page_download, &mut dest)?;
|
|
|
|
page_count += 1;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Finds the image url for a specified size for an e-codice page.
|
|
///
|
|
/// # Arguments
|
|
///
|
|
/// * `page_link` - The specific e-codice page url.
|
|
/// * `format` - The size of the page to look for.
|
|
fn get_download_link(page_link: &str, format: &PageFormat) -> Result<String, EcloadError> {
|
|
let mut response = reqwest::get(page_link)?;
|
|
let html = response.text()?;
|
|
let fragment = Html::parse_fragment(&html);
|
|
|
|
let a_selector = Selector::parse("ul.download-page-list > li > a").unwrap();
|
|
for element in fragment.select(&a_selector) {
|
|
let download_link = match element.value().attr("href") {
|
|
Some(href) => href.to_string(),
|
|
None => {
|
|
continue;
|
|
}
|
|
};
|
|
|
|
if download_link.ends_with(format.as_url_part()) {
|
|
return Ok(download_link);
|
|
}
|
|
}
|
|
|
|
Err(EcloadError::Error(
|
|
"could not find download link".to_string(),
|
|
))
|
|
}
|
|
}
|