ecload/pkg/ecload/ecload.go

105 lines
2.3 KiB
Go

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
package ecload
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"os"
"path"
"strings"
"sync"
)
// Addresses on the e-codices website.
const (
	// BASEURL is the scheme and host of the e-codices website.
	BASEURL = "https://www.e-codices.unifr.ch"

	// THUMBNAILURL is the format string for the thumbnail overview page of
	// an e-codice; its two verbs take the base URL and the e-codice id.
	THUMBNAILURL = "%s/en/thumbs/%s"
)
// DownloadBook saves a whole e-codice as a single PDF in the specified directory.
//
// The pages of the e-codice identified by id are downloaded as images into a
// temporary directory, which is removed again before returning, and are then
// combined into <outDir>/<id>.pdf, with every "/" in id replaced by "_".
// If outDir does not exist, it will be created.
// size must be one of small, medium, large, max.
//
// It returns the first error hit while looking up the pages, preparing the
// directories or writing the PDF.
func DownloadBook(outDir string, size string, id string, logger Logger) error {
	logger.Info.Println("finding pages...")
	pageUrls, err := getPageLinks(id)
	if err != nil {
		return fmt.Errorf("finding pages of %q: %w", id, err)
	}

	dir, err := ioutil.TempDir("", "ecload")
	if err != nil {
		return err
	}
	defer os.RemoveAll(dir)

	// pending counts the download goroutines started below.
	var pending sync.WaitGroup
	// Deferred calls run last-in-first-out, so on every return path (including
	// the early return inside the loop) the downloads are awaited before the
	// temporary directory they write into is removed.
	defer pending.Wait()

	logger.Info.Println("downloading pages...")
	for index, pageUrl := range pageUrls {
		downloadUrl, err := getSizeLink(pageUrl, size)
		if err != nil {
			return fmt.Errorf("finding %s download link for page %d: %w", size, index, err)
		}
		// Zero-padded names keep the images in page order when sorted by name.
		filename := fmt.Sprintf("%06d.jpg", index)

		pending.Add(1)
		go func() {
			defer pending.Done()
			// downloadToFile receives its WaitGroup by value, so nothing it
			// does with that copy can be observed here; completion is tracked
			// with pending instead.
			// NOTE(review): whatever downloadToFile returns is still
			// discarded, as before; confirm whether it reports failures.
			downloadToFile(filename, dir, downloadUrl, sync.WaitGroup{})
		}()
	}

	// Every page must be on disk before the PDF is assembled.
	pending.Wait()

	err = os.MkdirAll(outDir, os.ModePerm)
	if err != nil {
		return err
	}
	pdfPath := path.Join(outDir, fmt.Sprintf("%s.pdf", strings.ReplaceAll(id, "/", "_")))
	logger.Info.Printf("Saving pdf to %s...", pdfPath)
	return ImgsToPdf(dir, pdfPath)
}
// getSizeLink finds the download link for a page in a specific size.
//
// It loads the document at pageUrl and looks through the anchors of its
// download list for an href ending in "/<size>". If several anchors match,
// the last one wins. An error is returned when fetchDocument fails or when
// the page offers no download link for the requested size.
func getSizeLink(pageUrl string, size string) (string, error) {
	doc, err := fetchDocument(pageUrl)
	if err != nil {
		return "", err
	}

	suffix := "/" + size
	downloadSizeUrl := ""
	doc.Find("ul.download-page-list > li > a").Each(func(i int, s *goquery.Selection) {
		if downloadUrl, ok := s.Attr("href"); ok && strings.HasSuffix(downloadUrl, suffix) {
			downloadSizeUrl = downloadUrl
		}
	})

	// A matching href always ends in "/<size>", so an empty result means that
	// no anchor matched; report it instead of handing back an unusable URL.
	if downloadSizeUrl == "" {
		return "", fmt.Errorf("no %q download link found on %s", size, pageUrl)
	}
	return downloadSizeUrl, nil
}
// Find page urls for an e-codice.
// It loads the thumbnail page and parses the html.
func getPageLinks(id string) ([]string, error) {
thumbnailsUrl := fmt.Sprintf(THUMBNAILURL, BASEURL, id)
doc, err := fetchDocument(thumbnailsUrl)
if err != nil {
return nil, err
}
pageUrls := make([]string, 0)
doc.Find("div.thumbnail-image > a").Each(func(i int, s *goquery.Selection) {
pageUrl, ok := s.Attr("href")
if ok {
pageUrls = append(pageUrls, pageUrl)
}
})
return pageUrls, nil
}