use std::fs; use std::fs::DirEntry; use std::fs::File; use std::io::BufWriter; use std::io::copy; use std::path::Path; use image::DynamicImage; use image::GenericImageView; use printpdf::{Image, Mm, PdfDocument}; use scraper::{Html, Selector}; use tempfile::TempDir; use crate::error::EcloadError; const BASEURL: &str = "https://www.e-codices.unifr.ch"; const DPI: f64 = 300.0; const INCH_AS_MM: f64 = 25.4; /// Provides functions to download e-codices. pub struct EcLoader {} arg_enum! { #[derive(PartialEq, Debug)] /// Defines the different sizes that e-codice pages can be. pub enum PageFormat { Small, Medium, Large, Max, } } impl PageFormat { /// Converts the formats in the represantation that the e-codice website uses. fn as_url_part(&self) -> &str { match self { PageFormat::Small => "small", PageFormat::Medium => "medium", PageFormat::Large => "large", PageFormat::Max => "max", } } } impl EcLoader { /// Download an e-codice from https://www.e-codices.unifr.ch. /// /// # Arguments /// /// * `id` - The combined id ot the e-codice. Consisting of the library id and the e-codice id (for example "bbb/0003"). /// * `title` - Title of the pdf, this will also be the filename. /// * `out_dir` - Directory where the PDF gets saved (must already exist). /// * `format` - The size of the page to look for. pub fn download_codice( id: &str, title: &str, out_dir: &Path, format: PageFormat, ) -> Result<(), EcloadError> { let tmp_dir = tempfile::tempdir()?; debug!("temporary download directory: {}", tmp_dir.path().display()); EcLoader::download_pages(id, format, &tmp_dir) .and_then(|_| EcLoader::build_pdf(title, out_dir, &tmp_dir)) } /// Order files in a temporary directory /// /// # Arguments /// /// * `tmp_dir`- Reference to the temporary directory. fn get_ordered_tmp_files(tmp_dir: &TempDir) -> Result, EcloadError> { let paths = fs::read_dir(tmp_dir.path())?; let mut sorted_paths: Vec = paths.filter_map(|r| r.ok()).collect(); sorted_paths.sort_by_key(|dir| dir.path()); Ok(sorted_paths) } /// Calculate image dimensions in mm. /// /// # Arguments /// /// * `image` - Image for the calculation. fn mm_from_image(image: &DynamicImage) -> (Mm, Mm) { let (width, height) = image.dimensions(); let mm_width = Mm(width as f64 * (INCH_AS_MM / DPI)); let mm_height = Mm(height as f64 * (INCH_AS_MM / DPI)); (mm_width, mm_height) } /// Convert an image to BMP. /// /// # Arguments /// /// * `img_path` - Path to the image that should be converted. /// /// # Remarks /// /// The BMP file gets saved to the same directory as the original, with the same name apart from /// the extension. fn get_img_as_bmp(img_path: &Path) -> Result { let image = image::open(img_path)?; let img_bmp_path = img_path.with_extension("bmp"); image.save(&img_bmp_path)?; Ok(image::open(img_bmp_path)?) } /// Generate a PDF from images inside a directory. /// /// # Arguments /// /// * `title` - Title of the pdf, this will also be the filename. /// * `out_dir` - Directory where the PDF gets saved (must already exist). /// * `tmp_dir` - Reference to the temporary directory where the images for the pdf are. fn build_pdf(title: &str, out_dir: &Path, tmp_dir: &TempDir) -> Result<(), EcloadError> { let ordered_tmp_files = EcLoader::get_ordered_tmp_files(tmp_dir)?; let first_file = match ordered_tmp_files.first() { Some(file) => file, None => return Err(EcloadError::Error("no tmp files".to_string())), }; let first_image_data = image::open(first_file.path())?; let (first_img_width, first_img_height) = EcLoader::mm_from_image(&first_image_data); let (doc, page1, layer1) = PdfDocument::new(title, first_img_width, first_img_height, "Layer 1"); let mut current_layer = doc.get_page(page1).get_layer(layer1); let mut is_first_page = true; let mut page_count = 1; for entry in ordered_tmp_files { let filename_os = entry.file_name(); let filename = match filename_os.to_str() { Some(filename) => filename, None => continue, }; if !filename.ends_with(".jpg") { continue; } info!("saving page {} to pdf...", page_count); let image_bmp = match EcLoader::get_img_as_bmp(&entry.path()) { Ok(image_data) => image_data, Err(err) => { error!("could not decode {}: {:?}", entry.path().display(), err); continue; } }; let mut image_file = File::open(entry.path().with_extension("bmp")).unwrap(); let image = Image::try_from(image::bmp::BMPDecoder::new(&mut image_file)).unwrap(); let (img_width, img_height) = EcLoader::mm_from_image(&image_bmp); debug!("dimensions: {:?} x {:?}", img_width, img_height); if !is_first_page { let (new_page, new_layer) = doc.add_page(img_width, img_height, "Layer 1"); current_layer = doc.get_page(new_page).get_layer(new_layer); } image.add_to_layer( current_layer.clone(), None, None, None, None, None, Some(DPI), ); is_first_page = false; page_count += 1; } let pdf_file_path = out_dir.join(format!("{}.pdf", title)); info!("saved to {}", pdf_file_path.display()); let pdf_file = File::create(pdf_file_path)?; doc.save(&mut BufWriter::new(pdf_file))?; Ok(()) } fn get_thumbnail_url(id: &str) -> String { format!("{}/en/thumbs/{}", BASEURL, id) } /// Download all the pages of an e-codice to a temporary directory. /// /// # Arguments /// /// * `id` - The combined id ot the e-codice. Consisting of the library id and the e-codice id (for example "bbb/0003"). /// * `format` - The size of the page to look for. /// * `tmp_dir` - Reference to a temporary directory where the downloaded files get saved. fn download_pages(id: &str, format: PageFormat, tmp_dir: &TempDir) -> Result<(), EcloadError> { let thumbnail_url = EcLoader::get_thumbnail_url(id); let mut response = reqwest::get(thumbnail_url.as_str())?; let html = response.text()?; let fragment = Html::parse_fragment(&html); let li_selector = Selector::parse("div.thumbnail-image > a").unwrap(); let mut page_count = 0; for element in fragment.select(&li_selector) { let page_link = match element.value().attr("href") { Some(href) => href.to_string(), None => { continue; } }; debug!("searching download links for {}...", page_link); let download_link = match EcLoader::get_download_link(&page_link, &format) { Ok(download_link) => download_link, Err(err) => { error!( "could not find download link for {} with format {}: {:?}", &page_link, format, err ); continue; } }; debug!("found {}", download_link); info!("downloading {}...", download_link); let page_file_path = tmp_dir .path() .join(format!("{:0>5}.jpg", page_count.to_string())); let mut page_download = match reqwest::get(&download_link) { Ok(response) => response, Err(err) => { error!("could not download {}: {:?}", download_link, err); continue; } }; let mut dest = File::create(page_file_path)?; copy(&mut page_download, &mut dest)?; page_count += 1; } Ok(()) } /// Finds the image url for a specified size for an e-codice page. /// /// # Arguments /// /// * `page_link` - The specific e-codice page url. /// * `format` - The size of the page to look for. fn get_download_link(page_link: &str, format: &PageFormat) -> Result { let mut response = reqwest::get(page_link)?; let html = response.text()?; let fragment = Html::parse_fragment(&html); let a_selector = Selector::parse("ul.download-page-list > li > a").unwrap(); for element in fragment.select(&a_selector) { let download_link = match element.value().attr("href") { Some(href) => href.to_string(), None => { continue; } }; if download_link.ends_with(format.as_url_part()) { return Ok(download_link); } } Err(EcloadError::Error( "could not find download link".to_string(), )) } }