initial commit
This commit is contained in:
commit
53739b78f9
13 changed files with 3272 additions and 0 deletions
33
src/error.rs
Normal file
33
src/error.rs
Normal file
|
@ -0,0 +1,33 @@
|
|||
#[derive(Debug)]
|
||||
/// Convert different errors into a common one.
|
||||
pub enum EcloadError {
|
||||
Error(String),
|
||||
StdError(std::io::Error),
|
||||
ReqwestError(reqwest::Error),
|
||||
PrintPdfError(printpdf::Error),
|
||||
ImageError(image::ImageError),
|
||||
}
|
||||
|
||||
impl From<std::io::Error> for EcloadError {
|
||||
fn from(e: std::io::Error) -> Self {
|
||||
EcloadError::StdError(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<reqwest::Error> for EcloadError {
|
||||
fn from(e: reqwest::Error) -> Self {
|
||||
EcloadError::ReqwestError(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<printpdf::Error> for EcloadError {
|
||||
fn from(e: printpdf::Error) -> Self {
|
||||
EcloadError::PrintPdfError(e)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<image::ImageError> for EcloadError {
|
||||
fn from(e: image::ImageError) -> Self {
|
||||
EcloadError::ImageError(e)
|
||||
}
|
||||
}
|
281
src/loader.rs
Normal file
281
src/loader.rs
Normal file
|
@ -0,0 +1,281 @@
|
|||
use std::fs;
|
||||
use std::fs::DirEntry;
|
||||
use std::fs::File;
|
||||
use std::io::BufWriter;
|
||||
use std::io::copy;
|
||||
use std::path::Path;
|
||||
|
||||
use image::DynamicImage;
|
||||
use image::GenericImageView;
|
||||
use printpdf::{Image, Mm, PdfDocument};
|
||||
use scraper::{Html, Selector};
|
||||
use tempfile::TempDir;
|
||||
|
||||
use crate::error::EcloadError;
|
||||
|
||||
/// Root URL of the e-codices website; the thumbnail overview URL is built from it.
const BASEURL: &'static str = "https://www.e-codices.unifr.ch";

/// Resolution in dots per inch used to convert pixel sizes to millimetres and
/// passed along when images are placed on a PDF layer.
const DPI: f64 = 300.0;

/// Length of one inch in millimetres.
const INCH_AS_MM: f64 = 25.4;
|
||||
|
||||
/// Field-less type that groups the associated functions used to download an
/// e-codice and turn it into a PDF.
pub struct EcLoader {}
|
||||
|
||||
arg_enum! {
    /// The image sizes in which the pages of an e-codice can be requested.
    #[derive(PartialEq, Debug)]
    pub enum PageFormat {
        Small,
        Medium,
        Large,
        Max,
    }
}
|
||||
|
||||
impl PageFormat {
|
||||
/// Converts the formats in the represantation that the e-codice website uses.
|
||||
fn as_url_part(&self) -> &str {
|
||||
match self {
|
||||
PageFormat::Small => "small",
|
||||
PageFormat::Medium => "medium",
|
||||
PageFormat::Large => "large",
|
||||
PageFormat::Max => "max",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EcLoader {
|
||||
/// Download an e-codice from https://www.e-codices.unifr.ch.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `id` - The combined id ot the e-codice. Consisting of the library id and the e-codice id (for example "bbb/0003").
|
||||
/// * `title` - Title of the pdf, this will also be the filename.
|
||||
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
|
||||
/// * `format` - The size of the page to look for.
|
||||
pub fn download_codice(
|
||||
id: &str,
|
||||
title: &str,
|
||||
out_dir: &Path,
|
||||
format: PageFormat,
|
||||
) -> Result<(), EcloadError> {
|
||||
let tmp_dir = tempfile::tempdir()?;
|
||||
debug!("temporary download directory: {}", tmp_dir.path().display());
|
||||
|
||||
EcLoader::download_pages(id, format, &tmp_dir)
|
||||
.and_then(|_| EcLoader::build_pdf(title, out_dir, &tmp_dir))
|
||||
}
|
||||
|
||||
/// Order files in a temporary directory
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `tmp_dir`- Reference to the temporary directory.
|
||||
fn get_ordered_tmp_files(tmp_dir: &TempDir) -> Result<Vec<DirEntry>, EcloadError> {
|
||||
let paths = fs::read_dir(tmp_dir.path())?;
|
||||
let mut sorted_paths: Vec<DirEntry> = paths.filter_map(|r| r.ok()).collect();
|
||||
|
||||
sorted_paths.sort_by_key(|dir| dir.path());
|
||||
|
||||
Ok(sorted_paths)
|
||||
}
|
||||
|
||||
/// Calculate image dimensions in mm.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `image` - Image for the calculation.
|
||||
fn mm_from_image(image: &DynamicImage) -> (Mm, Mm) {
|
||||
let (width, height) = image.dimensions();
|
||||
let mm_width = Mm(width as f64 * (INCH_AS_MM / DPI));
|
||||
let mm_height = Mm(height as f64 * (INCH_AS_MM / DPI));
|
||||
|
||||
(mm_width, mm_height)
|
||||
}
|
||||
|
||||
/// Convert an image to BMP.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `img_path` - Path to the image that should be converted.
|
||||
///
|
||||
/// # Remarks
|
||||
///
|
||||
/// The BMP file gets saved to the same directory as the original, with the same name apart from
|
||||
/// the extension.
|
||||
fn get_img_as_bmp(img_path: &Path) -> Result<DynamicImage, EcloadError> {
|
||||
let image = image::open(img_path)?;
|
||||
let img_bmp_path = img_path.with_extension("bmp");
|
||||
image.save(&img_bmp_path)?;
|
||||
|
||||
Ok(image::open(img_bmp_path)?)
|
||||
}
|
||||
|
||||
/// Generate a PDF from images inside a directory.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `title` - Title of the pdf, this will also be the filename.
|
||||
/// * `out_dir` - Directory where the PDF gets saved (must already exist).
|
||||
/// * `tmp_dir` - Reference to the temporary directory where the images for the pdf are.
|
||||
fn build_pdf(title: &str, out_dir: &Path, tmp_dir: &TempDir) -> Result<(), EcloadError> {
|
||||
let ordered_tmp_files = EcLoader::get_ordered_tmp_files(tmp_dir)?;
|
||||
let first_file = match ordered_tmp_files.first() {
|
||||
Some(file) => file,
|
||||
None => return Err(EcloadError::Error("no tmp files".to_string())),
|
||||
};
|
||||
|
||||
let first_image_data = image::open(first_file.path())?;
|
||||
let (first_img_width, first_img_height) = EcLoader::mm_from_image(&first_image_data);
|
||||
|
||||
let (doc, page1, layer1) =
|
||||
PdfDocument::new(title, first_img_width, first_img_height, "Layer 1");
|
||||
let mut current_layer = doc.get_page(page1).get_layer(layer1);
|
||||
|
||||
let mut is_first_page = true;
|
||||
let mut page_count = 1;
|
||||
for entry in ordered_tmp_files {
|
||||
let filename_os = entry.file_name();
|
||||
let filename = match filename_os.to_str() {
|
||||
Some(filename) => filename,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
if !filename.ends_with(".jpg") {
|
||||
continue;
|
||||
}
|
||||
|
||||
info!("saving page {} to pdf...", page_count);
|
||||
let image_bmp = match EcLoader::get_img_as_bmp(&entry.path()) {
|
||||
Ok(image_data) => image_data,
|
||||
Err(err) => {
|
||||
error!("could not decode {}: {:?}", entry.path().display(), err);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut image_file = File::open(entry.path().with_extension("bmp")).unwrap();
|
||||
let image = Image::try_from(image::bmp::BMPDecoder::new(&mut image_file)).unwrap();
|
||||
|
||||
let (img_width, img_height) = EcLoader::mm_from_image(&image_bmp);
|
||||
debug!("dimensions: {:?} x {:?}", img_width, img_height);
|
||||
|
||||
if !is_first_page {
|
||||
let (new_page, new_layer) = doc.add_page(img_width, img_height, "Layer 1");
|
||||
current_layer = doc.get_page(new_page).get_layer(new_layer);
|
||||
}
|
||||
|
||||
image.add_to_layer(
|
||||
current_layer.clone(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
Some(DPI),
|
||||
);
|
||||
|
||||
is_first_page = false;
|
||||
page_count += 1;
|
||||
}
|
||||
|
||||
let pdf_file_path = out_dir.join(format!("{}.pdf", title));
|
||||
info!("saved to {}", pdf_file_path.display());
|
||||
let pdf_file = File::create(pdf_file_path)?;
|
||||
doc.save(&mut BufWriter::new(pdf_file))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_thumbnail_url(id: &str) -> String {
|
||||
format!("{}/en/thumbs/{}", BASEURL, id)
|
||||
}
|
||||
|
||||
/// Download all the pages of an e-codice to a temporary directory.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `id` - The combined id ot the e-codice. Consisting of the library id and the e-codice id (for example "bbb/0003").
|
||||
/// * `format` - The size of the page to look for.
|
||||
/// * `tmp_dir` - Reference to a temporary directory where the downloaded files get saved.
|
||||
fn download_pages(id: &str, format: PageFormat, tmp_dir: &TempDir) -> Result<(), EcloadError> {
|
||||
let thumbnail_url = EcLoader::get_thumbnail_url(id);
|
||||
let mut response = reqwest::get(thumbnail_url.as_str())?;
|
||||
let html = response.text()?;
|
||||
let fragment = Html::parse_fragment(&html);
|
||||
|
||||
let li_selector = Selector::parse("div.thumbnail-image > a").unwrap();
|
||||
|
||||
let mut page_count = 0;
|
||||
for element in fragment.select(&li_selector) {
|
||||
let page_link = match element.value().attr("href") {
|
||||
Some(href) => href.to_string(),
|
||||
None => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
debug!("searching download links for {}...", page_link);
|
||||
let download_link = match EcLoader::get_download_link(&page_link, &format) {
|
||||
Ok(download_link) => download_link,
|
||||
Err(err) => {
|
||||
error!(
|
||||
"could not find download link for {} with format {}: {:?}",
|
||||
&page_link, format, err
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
debug!("found {}", download_link);
|
||||
|
||||
info!("downloading {}...", download_link);
|
||||
let page_file_path = tmp_dir
|
||||
.path()
|
||||
.join(format!("{:0>5}.jpg", page_count.to_string()));
|
||||
|
||||
let mut page_download = match reqwest::get(&download_link) {
|
||||
Ok(response) => response,
|
||||
Err(err) => {
|
||||
error!("could not download {}: {:?}", download_link, err);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let mut dest = File::create(page_file_path)?;
|
||||
copy(&mut page_download, &mut dest)?;
|
||||
|
||||
page_count += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Finds the image url for a specified size for an e-codice page.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `page_link` - The specific e-codice page url.
|
||||
/// * `format` - The size of the page to look for.
|
||||
fn get_download_link(page_link: &str, format: &PageFormat) -> Result<String, EcloadError> {
|
||||
let mut response = reqwest::get(page_link)?;
|
||||
let html = response.text()?;
|
||||
let fragment = Html::parse_fragment(&html);
|
||||
|
||||
let a_selector = Selector::parse("ul.download-page-list > li > a").unwrap();
|
||||
for element in fragment.select(&a_selector) {
|
||||
let download_link = match element.value().attr("href") {
|
||||
Some(href) => href.to_string(),
|
||||
None => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if download_link.ends_with(format.as_url_part()) {
|
||||
return Ok(download_link);
|
||||
}
|
||||
}
|
||||
|
||||
Err(EcloadError::Error(
|
||||
"could not find download link".to_string(),
|
||||
))
|
||||
}
|
||||
}
|
84
src/main.rs
Normal file
84
src/main.rs
Normal file
|
@ -0,0 +1,84 @@
|
|||
#[macro_use]
|
||||
extern crate clap;
|
||||
extern crate image;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate printpdf;
|
||||
extern crate reqwest;
|
||||
extern crate scraper;
|
||||
extern crate simple_logger;
|
||||
extern crate tempfile;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use clap::{App, AppSettings, Arg};
|
||||
|
||||
use crate::error::EcloadError;
|
||||
use crate::loader::EcLoader;
|
||||
|
||||
pub mod error;
|
||||
pub mod loader;
|
||||
|
||||
fn main() {
|
||||
simple_logger::init_with_level(log::Level::Info).unwrap();
|
||||
|
||||
let app = App::new("ecload")
|
||||
.version("0.1.0")
|
||||
.author("Sebastian Hugentobler <sebastian@vanwa.ch>")
|
||||
.about("Download books from https://www.e-codices.unifr.ch")
|
||||
.arg(
|
||||
Arg::with_name("output directory")
|
||||
.short("o")
|
||||
.long("out-dir")
|
||||
.value_name("DIR")
|
||||
.help("The directory where the resulting pdf is saved.")
|
||||
.takes_value(true)
|
||||
.default_value("."),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("size")
|
||||
.short("s")
|
||||
.long("size")
|
||||
.value_name("SIZE")
|
||||
.help("Sets the size of the downloaded images.")
|
||||
.takes_value(true)
|
||||
.possible_values(&loader::PageFormat::variants())
|
||||
.default_value("medium")
|
||||
.case_insensitive(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("id")
|
||||
.short("i")
|
||||
.long("id")
|
||||
.value_name("ID")
|
||||
.help("Id of the book to download. Copy the last to url parts on the overview page to get it (for example bbb/0003).")
|
||||
.takes_value(true),
|
||||
)
|
||||
.setting(AppSettings::ArgRequiredElseHelp);
|
||||
|
||||
let matches = app.clone().get_matches();
|
||||
|
||||
let out_path_str = matches.value_of("output directory").unwrap();
|
||||
let out_path = Path::new(out_path_str);
|
||||
let size = value_t!(matches.value_of("size"), loader::PageFormat).unwrap();
|
||||
let id = match matches.value_of("id") {
|
||||
Some(id) => id,
|
||||
None => {
|
||||
println!("ID is a required argument:");
|
||||
println!();
|
||||
app.clone().print_help().ok();
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let title = &id.replace("/", "_");
|
||||
|
||||
match run(&out_path, size, id, title) {
|
||||
Ok(_) => info!("finished!"),
|
||||
Err(e) => error!("something went wrong: {:?}", e),
|
||||
};
|
||||
}
|
||||
|
||||
fn run(out_path: &Path, format: loader::PageFormat, id: &str, title: &str) -> Result<(), EcloadError> {
|
||||
EcLoader::download_codice(id, title, out_path, format)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue