ecload/pkg/ecload/ecload.go
2019-05-28 16:35:01 +02:00

137 lines
2.8 KiB
Go

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
package ecload
import (
"fmt"
"github.com/PuerkitoBio/goquery"
"io/ioutil"
"os"
"path"
"strings"
"sync"
)
const (
	// MAX_DOWNLOADS is the capacity of the semaphore that bounds how many
	// page downloads run at the same time.
	MAX_DOWNLOADS = 10

	// BASEURL is the root URL of the e-codices website.
	BASEURL = "https://www.e-codices.unifr.ch"

	// THUMBNAILURL is the format string for a codex's thumbnail overview
	// page. The first verb receives the base URL, the second the codex id.
	THUMBNAILURL = "%s/en/thumbs/%s"
)
// DownloadBook saves a whole e-codice as a PDF in the specified directory.
// If the directory does not exist, it will be created.
// size must be one of small, medium, large, max.
//
// The pages are first downloaded into a temporary directory, at most
// MAX_DOWNLOADS at a time, and are then handed to ImgDirToPdf. The PDF is
// written to outDir as "<id>-<size>.pdf", with every "/" in id replaced by
// "_". The temporary directory is removed before the function returns.
//
// The first error encountered is returned. After a failure, pages that have
// not started yet are skipped, and the function waits for in-flight downloads
// to finish so that no goroutine outlives the call or writes into the
// temporary directory while it is being removed. An error is also returned
// when no pages are found for id.
func DownloadBook(outDir string, size string, id string, logger Logger) error {
	logger.Info.Println("finding pages...")
	pageUrls, err := getPageLinks(id)
	if err != nil {
		return err
	}
	if len(pageUrls) == 0 {
		return fmt.Errorf("no pages found for %q", id)
	}

	dir, err := ioutil.TempDir("", "ecload")
	if err != nil {
		return err
	}
	defer os.RemoveAll(dir)

	logger.Info.Println("downloading pages...")

	var (
		wg       sync.WaitGroup
		errOnce  sync.Once
		firstErr error
	)
	semaphore := make(chan struct{}, MAX_DOWNLOADS)
	// failed is closed as soon as any download fails; workers that have not
	// started yet observe it and bail out instead of doing useless work.
	failed := make(chan struct{})
	fail := func(err error) {
		errOnce.Do(func() {
			firstErr = err
			close(failed)
		})
	}

	for index, pageUrl := range pageUrls {
		wg.Add(1)
		go func(index int, pageUrl string) {
			defer wg.Done()

			// Wait for a free download slot, unless something already failed.
			select {
			case semaphore <- struct{}{}:
			case <-failed:
				return
			}
			defer func() { <-semaphore }()

			// select chooses randomly when both cases are ready, so check
			// for an earlier failure once more before starting the download.
			select {
			case <-failed:
				return
			default:
			}

			downloadUrl, err := getSizeLink(pageUrl, size)
			if err != nil {
				fail(err)
				return
			}
			// Zero-padded index so the file names sort in page order.
			filename := fmt.Sprintf("%06d.jpg", index)
			if err = downloadToFile(filename, dir, downloadUrl); err != nil {
				fail(err)
			}
		}(index, pageUrl)
	}

	// Every worker has returned after Wait, so reading firstErr is race-free
	// and nothing is still writing into dir.
	wg.Wait()
	if firstErr != nil {
		return firstErr
	}

	err = os.MkdirAll(outDir, os.ModePerm)
	if err != nil {
		return err
	}
	pdfPath := path.Join(outDir, fmt.Sprintf("%s-%s.pdf", strings.ReplaceAll(id, "/", "_"), size))
	logger.Info.Printf("Saving pdf to %s...", pdfPath)
	return ImgDirToPdf(dir, pdfPath)
}
// getSizeLink finds the download link of the requested size for one page.
//
// It loads the document for pageUrl via fetchDocument and inspects the
// anchors matched by "ul.download-page-list > li > a" for an href ending in
// "/<size>". If several anchors match, the last one visited wins.
//
// An error is returned when the document cannot be loaded or when no anchor
// matches the requested size, so callers never receive an empty URL together
// with a nil error.
func getSizeLink(pageUrl string, size string) (string, error) {
	doc, err := fetchDocument(pageUrl)
	if err != nil {
		return "", err
	}

	// The suffix is the same for every anchor, so build it once.
	suffix := "/" + size
	downloadSizeUrl := ""
	doc.Find("ul.download-page-list > li > a").Each(func(i int, s *goquery.Selection) {
		if downloadUrl, ok := s.Attr("href"); ok && strings.HasSuffix(downloadUrl, suffix) {
			downloadSizeUrl = downloadUrl
		}
	})
	if downloadSizeUrl == "" {
		return "", fmt.Errorf("no download link of size %q found on %s", size, pageUrl)
	}
	return downloadSizeUrl, nil
}
// Find page urls for an e-codice.
// It loads the thumbnail page and parses the html.
func getPageLinks(id string) ([]string, error) {
thumbnailsUrl := fmt.Sprintf(THUMBNAILURL, BASEURL, id)
doc, err := fetchDocument(thumbnailsUrl)
if err != nil {
return nil, err
}
pageUrls := make([]string, 0)
doc.Find("div.thumbnail-image > a").Each(func(i int, s *goquery.Selection) {
pageUrl, ok := s.Attr("href")
if ok {
pageUrls = append(pageUrls, pageUrl)
}
})
return pageUrls, nil
}