initial commit

This commit is contained in:
Sebastian Hugentobler 2019-01-28 12:28:08 +01:00
commit 53739b78f9
13 changed files with 3272 additions and 0 deletions

33
src/error.rs Normal file
View file

@ -0,0 +1,33 @@
/// Common error type that wraps every failure the application can run into.
///
/// The `From` implementations in this module let the `?` operator convert the
/// wrapped library errors into this type automatically.
#[derive(Debug)]
pub enum EcloadError {
    /// A free-form error message produced by this crate itself.
    Error(String),
    /// An I/O error from the standard library.
    StdError(std::io::Error),
    /// An error reported by `reqwest`.
    ReqwestError(reqwest::Error),
    /// An error reported by `printpdf`.
    PrintPdfError(printpdf::Error),
    /// An error reported by the `image` crate.
    ImageError(image::ImageError),
}

impl std::fmt::Display for EcloadError {
    /// Writes a short, human readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            EcloadError::Error(message) => write!(f, "{}", message),
            EcloadError::StdError(e) => write!(f, "I/O error: {}", e),
            EcloadError::ReqwestError(e) => write!(f, "HTTP error: {}", e),
            // Formatted with `Debug`: that is the only formatting trait this
            // enum already requires from `printpdf::Error` (via the derive).
            EcloadError::PrintPdfError(e) => write!(f, "PDF error: {:?}", e),
            EcloadError::ImageError(e) => write!(f, "image error: {}", e),
        }
    }
}

// Makes the type usable wherever a `std::error::Error` is expected
// (for example `Box<dyn std::error::Error>`).
impl std::error::Error for EcloadError {}
/// Wraps a standard library I/O error in `EcloadError::StdError`.
impl From<std::io::Error> for EcloadError {
    fn from(io_error: std::io::Error) -> Self {
        EcloadError::StdError(io_error)
    }
}
impl From<reqwest::Error> for EcloadError {
fn from(e: reqwest::Error) -> Self {
EcloadError::ReqwestError(e)
}
}
impl From<printpdf::Error> for EcloadError {
fn from(e: printpdf::Error) -> Self {
EcloadError::PrintPdfError(e)
}
}
/// Wraps an `image` crate error in `EcloadError::ImageError`.
impl From<image::ImageError> for EcloadError {
    fn from(image_error: image::ImageError) -> Self {
        EcloadError::ImageError(image_error)
    }
}

281
src/loader.rs Normal file
View file

@ -0,0 +1,281 @@
use std::fs;
use std::fs::DirEntry;
use std::fs::File;
use std::io::BufWriter;
use std::io::copy;
use std::path::Path;
use image::DynamicImage;
use image::GenericImageView;
use printpdf::{Image, Mm, PdfDocument};
use scraper::{Html, Selector};
use tempfile::TempDir;
use crate::error::EcloadError;
/// Base URL of the e-codices website; the thumbnail overview URL is built from it.
const BASEURL: &str = "https://www.e-codices.unifr.ch";
/// Resolution in dots per inch used to convert pixel dimensions to millimetres
/// and passed along when an image is placed on a PDF layer.
const DPI: f64 = 300.0;
/// Number of millimetres in one inch.
const INCH_AS_MM: f64 = 25.4;
/// Provides functions to download e-codices.
///
/// The struct has no fields; everything is exposed as associated functions.
pub struct EcLoader {}
// Declared through clap's `arg_enum!` so the enum can be used as a command
// line value: `main` lists `PageFormat::variants()` as the possible values
// and parses the argument into this type with `value_t!`.
arg_enum! {
#[derive(PartialEq, Debug)]
/// Defines the different sizes in which the pages of an e-codice can be downloaded.
pub enum PageFormat {
Small,
Medium,
Large,
Max,
}
}
impl PageFormat {
    /// Returns the lower-case name of this size, which is compared against
    /// the end of the download links found on an e-codices page.
    fn as_url_part(&self) -> &str {
        match *self {
            PageFormat::Max => "max",
            PageFormat::Large => "large",
            PageFormat::Medium => "medium",
            PageFormat::Small => "small",
        }
    }
}
impl EcLoader {
/// Download an e-codice from https://www.e-codices.unifr.ch and store it as a PDF.
///
/// All pages are first downloaded into a freshly created temporary
/// directory and are then assembled into a single PDF.
///
/// # Arguments
///
/// * `id` - The combined id of the e-codice, consisting of the library id and the e-codice id (for example "bbb/0003").
/// * `title` - Title of the pdf, this will also be the filename.
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
/// * `format` - The size of the page to look for.
pub fn download_codice(
    id: &str,
    title: &str,
    out_dir: &Path,
    format: PageFormat,
) -> Result<(), EcloadError> {
    let scratch_dir = tempfile::tempdir()?;
    debug!("temporary download directory: {}", scratch_dir.path().display());

    // The PDF is only built when downloading the pages did not fail.
    EcLoader::download_pages(id, format, &scratch_dir)?;
    EcLoader::build_pdf(title, out_dir, &scratch_dir)
}
/// List the entries of a temporary directory, sorted by their path.
///
/// Entries that cannot be read are left out of the result.
///
/// # Arguments
///
/// * `tmp_dir` - Reference to the temporary directory.
fn get_ordered_tmp_files(tmp_dir: &TempDir) -> Result<Vec<DirEntry>, EcloadError> {
    let mut entries: Vec<DirEntry> = Vec::new();
    for candidate in fs::read_dir(tmp_dir.path())? {
        if let Ok(entry) = candidate {
            entries.push(entry);
        }
    }
    entries.sort_by(|left, right| left.path().cmp(&right.path()));
    Ok(entries)
}
/// Convert the pixel dimensions of an image to millimetres, based on `DPI`.
///
/// # Arguments
///
/// * `image` - Image for the calculation.
///
/// Returns the width and the height, in that order.
fn mm_from_image(image: &DynamicImage) -> (Mm, Mm) {
    let mm_per_pixel = INCH_AS_MM / DPI;
    let (px_width, px_height) = image.dimensions();
    (
        Mm(f64::from(px_width) * mm_per_pixel),
        Mm(f64::from(px_height) * mm_per_pixel),
    )
}
/// Convert an image to BMP and load the converted file.
///
/// # Arguments
///
/// * `img_path` - Path to the image that should be converted.
///
/// # Remarks
///
/// The BMP file is written next to the original, with the same name apart
/// from the extension. The returned image is the one read back from that
/// BMP file.
fn get_img_as_bmp(img_path: &Path) -> Result<DynamicImage, EcloadError> {
    let bmp_path = img_path.with_extension("bmp");
    let source_image = image::open(img_path)?;
    source_image.save(&bmp_path)?;
    let converted = image::open(&bmp_path)?;
    Ok(converted)
}
/// Generate a PDF from images inside a directory.
///
/// # Arguments
///
/// * `title` - Title of the pdf, this will also be the filename.
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
/// * `tmp_dir` - Reference to the temporary directory where the images for the pdf are.
fn build_pdf(title: &str, out_dir: &Path, tmp_dir: &TempDir) -> Result<(), EcloadError> {
let ordered_tmp_files = EcLoader::get_ordered_tmp_files(tmp_dir)?;
let first_file = match ordered_tmp_files.first() {
Some(file) => file,
None => return Err(EcloadError::Error("no tmp files".to_string())),
};
let first_image_data = image::open(first_file.path())?;
let (first_img_width, first_img_height) = EcLoader::mm_from_image(&first_image_data);
let (doc, page1, layer1) =
PdfDocument::new(title, first_img_width, first_img_height, "Layer 1");
let mut current_layer = doc.get_page(page1).get_layer(layer1);
let mut is_first_page = true;
let mut page_count = 1;
for entry in ordered_tmp_files {
let filename_os = entry.file_name();
let filename = match filename_os.to_str() {
Some(filename) => filename,
None => continue,
};
if !filename.ends_with(".jpg") {
continue;
}
info!("saving page {} to pdf...", page_count);
let image_bmp = match EcLoader::get_img_as_bmp(&entry.path()) {
Ok(image_data) => image_data,
Err(err) => {
error!("could not decode {}: {:?}", entry.path().display(), err);
continue;
}
};
let mut image_file = File::open(entry.path().with_extension("bmp")).unwrap();
let image = Image::try_from(image::bmp::BMPDecoder::new(&mut image_file)).unwrap();
let (img_width, img_height) = EcLoader::mm_from_image(&image_bmp);
debug!("dimensions: {:?} x {:?}", img_width, img_height);
if !is_first_page {
let (new_page, new_layer) = doc.add_page(img_width, img_height, "Layer 1");
current_layer = doc.get_page(new_page).get_layer(new_layer);
}
image.add_to_layer(
current_layer.clone(),
None,
None,
None,
None,
None,
Some(DPI),
);
is_first_page = false;
page_count += 1;
}
let pdf_file_path = out_dir.join(format!("{}.pdf", title));
info!("saved to {}", pdf_file_path.display());
let pdf_file = File::create(pdf_file_path)?;
doc.save(&mut BufWriter::new(pdf_file))?;
Ok(())
}
/// Build the URL of the thumbnail overview page of an e-codice.
///
/// # Arguments
///
/// * `id` - The combined id of the e-codice (for example "bbb/0003").
fn get_thumbnail_url(id: &str) -> String {
    [BASEURL, "en", "thumbs", id].join("/")
}
/// Download all the pages of an e-codice to a temporary directory.
///
/// The thumbnail overview page is scraped for the links to the single
/// pages. For each of them the download link of the requested size is
/// looked up and the image is stored as a zero padded, numbered `.jpg`
/// file. Pages whose link cannot be found or whose download fails are
/// logged and skipped.
///
/// # Arguments
///
/// * `id` - The combined id of the e-codice, consisting of the library id and the e-codice id (for example "bbb/0003").
/// * `format` - The size of the page to look for.
/// * `tmp_dir` - Reference to a temporary directory where the downloaded files get saved.
///
/// # Errors
///
/// Fails when the overview page cannot be fetched (including HTTP error
/// statuses) or when a downloaded page cannot be written to `tmp_dir`.
fn download_pages(id: &str, format: PageFormat, tmp_dir: &TempDir) -> Result<(), EcloadError> {
    let thumbnail_url = EcLoader::get_thumbnail_url(id);
    // Treat HTTP error statuses as failures instead of scraping an error page.
    let mut response = reqwest::get(thumbnail_url.as_str())?.error_for_status()?;
    let html = response.text()?;
    let fragment = Html::parse_fragment(&html);
    let link_selector =
        Selector::parse("div.thumbnail-image > a").expect("the selector literal is valid");

    let mut page_count = 0;
    for element in fragment.select(&link_selector) {
        let page_link = match element.value().attr("href") {
            Some(href) => href,
            None => continue,
        };

        debug!("searching download links for {}...", page_link);
        let download_link = match EcLoader::get_download_link(page_link, &format) {
            Ok(download_link) => download_link,
            Err(err) => {
                error!(
                    "could not find download link for {} with format {}: {:?}",
                    page_link, format, err
                );
                continue;
            }
        };
        debug!("found {}", download_link);

        info!("downloading {}...", download_link);
        // Without the status check the body of an error response would be
        // stored as if it were a page image.
        let mut page_download =
            match reqwest::get(&download_link).and_then(|page| page.error_for_status()) {
                Ok(response) => response,
                Err(err) => {
                    error!("could not download {}: {:?}", download_link, err);
                    continue;
                }
            };

        // Zero padded names keep the path order equal to the page order.
        let page_file_path = tmp_dir.path().join(format!("{:05}.jpg", page_count));
        let mut dest = File::create(page_file_path)?;
        copy(&mut page_download, &mut dest)?;
        page_count += 1;
    }
    Ok(())
}
/// Finds the image url for a specified size for an e-codice page.
///
/// # Arguments
///
/// * `page_link` - The specific e-codice page url.
/// * `format` - The size of the page to look for.
fn get_download_link(page_link: &str, format: &PageFormat) -> Result<String, EcloadError> {
let mut response = reqwest::get(page_link)?;
let html = response.text()?;
let fragment = Html::parse_fragment(&html);
let a_selector = Selector::parse("ul.download-page-list > li > a").unwrap();
for element in fragment.select(&a_selector) {
let download_link = match element.value().attr("href") {
Some(href) => href.to_string(),
None => {
continue;
}
};
if download_link.ends_with(format.as_url_part()) {
return Ok(download_link);
}
}
Err(EcloadError::Error(
"could not find download link".to_string(),
))
}
}

84
src/main.rs Normal file
View file

@ -0,0 +1,84 @@
#[macro_use]
extern crate clap;
extern crate image;
#[macro_use]
extern crate log;
extern crate printpdf;
extern crate reqwest;
extern crate scraper;
extern crate simple_logger;
extern crate tempfile;
use std::path::Path;
use clap::{App, AppSettings, Arg};
use crate::error::EcloadError;
use crate::loader::EcLoader;
pub mod error;
pub mod loader;
fn main() {
simple_logger::init_with_level(log::Level::Info).unwrap();
let app = App::new("ecload")
.version("0.1.0")
.author("Sebastian Hugentobler <sebastian@vanwa.ch>")
.about("Download books from https://www.e-codices.unifr.ch")
.arg(
Arg::with_name("output directory")
.short("o")
.long("out-dir")
.value_name("DIR")
.help("The directory where the resulting pdf is saved.")
.takes_value(true)
.default_value("."),
)
.arg(
Arg::with_name("size")
.short("s")
.long("size")
.value_name("SIZE")
.help("Sets the size of the downloaded images.")
.takes_value(true)
.possible_values(&loader::PageFormat::variants())
.default_value("medium")
.case_insensitive(true),
)
.arg(
Arg::with_name("id")
.short("i")
.long("id")
.value_name("ID")
.help("Id of the book to download. Copy the last to url parts on the overview page to get it (for example bbb/0003).")
.takes_value(true),
)
.setting(AppSettings::ArgRequiredElseHelp);
let matches = app.clone().get_matches();
let out_path_str = matches.value_of("output directory").unwrap();
let out_path = Path::new(out_path_str);
let size = value_t!(matches.value_of("size"), loader::PageFormat).unwrap();
let id = match matches.value_of("id") {
Some(id) => id,
None => {
println!("ID is a required argument:");
println!();
app.clone().print_help().ok();
return;
}
};
let title = &id.replace("/", "_");
match run(&out_path, size, id, title) {
Ok(_) => info!("finished!"),
Err(e) => error!("something went wrong: {:?}", e),
};
}
/// Runs the download with the already parsed command line arguments.
///
/// Forwards the arguments to `EcLoader::download_codice` and returns its result.
///
/// # Arguments
///
/// * `out_path` - Directory where the PDF gets saved.
/// * `format` - The size of the pages to download.
/// * `id` - The combined id of the e-codice (for example "bbb/0003").
/// * `title` - Title of the pdf, passed on unchanged.
fn run(out_path: &Path, format: loader::PageFormat, id: &str, title: &str) -> Result<(), EcloadError> {
EcLoader::download_codice(id, title, out_path, format)
}