Change from Rust to Go, as I could not figure out the image-loading problem in Rust
This commit is contained in:
parent
eb7502f52e
commit
e3b2b03d55
23 changed files with 631 additions and 2680 deletions
104
pkg/ecload/ecload.go
Normal file
104
pkg/ecload/ecload.go
Normal file
|
@ -0,0 +1,104 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
||||
|
||||
package ecload
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
const (
	// BASEURL is the root of the e-codices website.
	BASEURL = "https://www.e-codices.unifr.ch"

	// THUMBNAILURL is the format string for a manuscript's thumbnail
	// overview page; its verbs are filled with the base URL and the
	// manuscript id, in that order.
	THUMBNAILURL = "%s/en/thumbs/%s"
)
|
||||
|
||||
// Save a whole e-codice to the specified directory.
|
||||
// If the directory does not exist, it will be created.
|
||||
// size must be one of small, medium, large, max.
|
||||
func DownloadBook(outDir string, size string, id string, logger Logger) error {
|
||||
logger.Info.Println("finding pages...")
|
||||
|
||||
pageUrls, err := getPageLinks(id)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
dir, err := ioutil.TempDir("", "ecload")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer os.RemoveAll(dir)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
logger.Info.Println("downloading pages...")
|
||||
for index, pageUrl := range pageUrls {
|
||||
downloadUrl, err:= getSizeLink(pageUrl, size)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
filename := fmt.Sprintf("%06d.jpg", index)
|
||||
go downloadToFile(filename, dir, downloadUrl, wg)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
err = os.MkdirAll(outDir, os.ModePerm)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
pdfPath := path.Join(outDir, fmt.Sprintf("%s.pdf", strings.ReplaceAll(id, "/", "_")))
|
||||
logger.Info.Printf("Saving pdf to %s...", pdfPath)
|
||||
|
||||
return ImgsToPdf(dir, pdfPath)
|
||||
}
|
||||
|
||||
// Find the download link for a page of a specific size.
|
||||
func getSizeLink(pageUrl string , size string) (string, error) {
|
||||
doc, err := fetchDocument(pageUrl)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
downloadSizeUrl := ""
|
||||
|
||||
doc.Find("ul.download-page-list > li > a").Each(func(i int, s *goquery.Selection) {
|
||||
downloadUrl, ok := s.Attr("href")
|
||||
|
||||
if ok && strings.HasSuffix(downloadUrl, fmt.Sprintf("/%s", size)) {
|
||||
downloadSizeUrl = downloadUrl
|
||||
}
|
||||
})
|
||||
|
||||
return downloadSizeUrl, nil
|
||||
}
|
||||
|
||||
// Find page urls for an e-codice.
|
||||
// It loads the thumbnail page and parses the html.
|
||||
func getPageLinks(id string) ([]string, error) {
|
||||
thumbnailsUrl := fmt.Sprintf(THUMBNAILURL, BASEURL, id)
|
||||
|
||||
doc, err := fetchDocument(thumbnailsUrl)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
pageUrls := make([]string, 0)
|
||||
doc.Find("div.thumbnail-image > a").Each(func(i int, s *goquery.Selection) {
|
||||
pageUrl, ok := s.Attr("href")
|
||||
|
||||
if ok {
|
||||
pageUrls = append(pageUrls, pageUrl)
|
||||
}
|
||||
})
|
||||
|
||||
return pageUrls, nil
|
||||
}
|
68
pkg/ecload/fetcher.go
Normal file
68
pkg/ecload/fetcher.go
Normal file
|
@ -0,0 +1,68 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
||||
|
||||
package ecload
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
"path"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Download a html page from an url (must be UTF-8) and convert it to a goquery document.
|
||||
func fetchDocument(url string) (*goquery.Document, error) {
|
||||
res, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(res.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
// Download a file.
|
||||
func downloadToFile(filename string, dir string, pageUrl string, wg sync.WaitGroup) error {
|
||||
wg.Add(1)
|
||||
defer wg.Done()
|
||||
|
||||
fullpath := path.Join(dir, filename)
|
||||
|
||||
out, err := os.Create(fullpath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer out.Close()
|
||||
|
||||
res, err := http.Get(pageUrl)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer res.Body.Close()
|
||||
|
||||
if res.StatusCode != 200 {
|
||||
return fmt.Errorf("status code error: %d %s", res.StatusCode, res.Status)
|
||||
}
|
||||
|
||||
_, err = io.Copy(out, res.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
14
pkg/ecload/logger.go
Normal file
14
pkg/ecload/logger.go
Normal file
|
@ -0,0 +1,14 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
||||
|
||||
package ecload
|
||||
|
||||
import "log"
|
||||
|
||||
// Logger bundles one standard-library logger per severity level.
type Logger struct {
	// The four severity levels, in declaration order.
	Trace, Info, Warning, Error *log.Logger
}
|
51
pkg/ecload/pdf.go
Normal file
51
pkg/ecload/pdf.go
Normal file
|
@ -0,0 +1,51 @@
|
|||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at https://mozilla.org/MPL/2.0/. */
|
||||
|
||||
package ecload
|
||||
|
||||
import (
	"image"
	_ "image/jpeg"
	"io"
	"io/ioutil"
	"os"
	"path"

	"github.com/jung-kurt/gofpdf"
)
|
||||
|
||||
// Concatenate all jpg files in a directory to a single pdf.
|
||||
func ImgsToPdf(dir string, output string) error {
|
||||
pdf := gofpdf.New("P", "mm", "", "")
|
||||
|
||||
files, err := ioutil.ReadDir(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, f := range files {
|
||||
filepath := path.Join(dir, f.Name())
|
||||
|
||||
reader, err := os.Open(filepath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
img, _, err := image.DecodeConfig(reader)
|
||||
if err != nil {
|
||||
//return err
|
||||
continue
|
||||
}
|
||||
|
||||
pdf.AddPageFormat("P", gofpdf.SizeType{Wd: float64(img.Width), Ht: float64(img.Height)})
|
||||
|
||||
opt := gofpdf.ImageOptions{ImageType: "jpg", ReadDpi: true}
|
||||
pdf.RegisterImageOptionsReader(f.Name(), opt, reader)
|
||||
pdf.ImageOptions(f.Name(), 0, 0, 0, 0, false, opt, 0, "")
|
||||
|
||||
|
||||
reader.Close()
|
||||
}
|
||||
|
||||
return pdf.OutputFileAndClose(output)
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue