From 2a88e5c9005018430305b65f231dba9fe2feeb6d Mon Sep 17 00:00:00 2001 From: Sebastian Hugentobler Date: Tue, 28 May 2019 11:28:20 +0200 Subject: [PATCH] first usable version --- .gitignore | 10 ++-------- Makefile | 6 +++--- cmd/ecload/main.go | 2 +- pkg/ecload/ecload.go | 29 ++++++++++++++++++++++------- pkg/ecload/fetcher.go | 6 +----- pkg/ecload/pdf.go | 41 +++++++++++++++++++---------------------- 6 files changed, 48 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 2055b1e..03cd335 100644 --- a/.gitignore +++ b/.gitignore @@ -3,11 +3,5 @@ *.swp **/.idea/workspace.xml **/.idea/tasks.xml -bin -*.exe -*.dll -*.so -*.dylib -*.test -*.out -.glide/ +*.pdf +bin/ diff --git a/Makefile b/Makefile index e10eb75..71c408f 100644 --- a/Makefile +++ b/Makefile @@ -6,11 +6,11 @@ clean: rm -r bin/ go clean ./cmd/ecload/ ./pkg/ecload/ -bin/ecload: +bin/ecload: cmd/ecload/*.go pkg/ecload/*.go GOOS=linux GOARCH=amd64 go build -ldflags '-s' -v -o $@ cmd/ecload/main.go -bin/ecload.exe: +bin/ecload.exe: cmd/ecload/*.go pkg/ecload/*.go GOOS=windows GOARCH=amd64 go build -ldflags '-s' -v -o $@ cmd/ecload/main.go -bin/ecload-mac: +bin/ecload-mac: cmd/ecload/*.go pkg/ecload/*.go GOOS=darwin GOARCH=amd64 go build -ldflags '-s' -v -o $@ cmd/ecload/main.go diff --git a/cmd/ecload/main.go b/cmd/ecload/main.go index b74d9a4..b4da6c6 100644 --- a/cmd/ecload/main.go +++ b/cmd/ecload/main.go @@ -26,7 +26,7 @@ func initLogger( Trace: log.New(traceHandle, "TRACE: ", log.Ldate|log.Ltime), Info: log.New(infoHandle, "INFO: ", log.Ldate|log.Ltime), Warning: log.New(warningHandle, "WARNING: ", log.Ldate|log.Ltime), - Error: log.New(errorHandle, "ERROR: ", log.Ldate|log.Ltime), + Error: log.New(errorHandle, "ERROR: ", log.Ldate|log.Ltime|log.Lshortfile), } } diff --git a/pkg/ecload/ecload.go b/pkg/ecload/ecload.go index b1e17d7..4f21a90 100644 --- a/pkg/ecload/ecload.go +++ b/pkg/ecload/ecload.go @@ -14,6 +14,8 @@ import ( "sync" ) +const MAX_DOWNLOADS = 10 + const BASEURL = "https://www.e-codices.unifr.ch" const THUMBNAILURL = "%s/en/thumbs/%s" @@ -36,16 +38,29 @@ func DownloadBook(outDir string, size string, id string, logger Logger) error { defer os.RemoveAll(dir) var wg sync.WaitGroup + semaphore := make(chan struct{}, MAX_DOWNLOADS) logger.Info.Println("downloading pages...") for index, pageUrl := range pageUrls { - downloadUrl, err:= getSizeLink(pageUrl, size) - if err != nil { - return err - } + wg.Add(1) - filename := fmt.Sprintf("%06d.jpg", index) - go downloadToFile(filename, dir, downloadUrl, wg) + go func(index int, pageUrl string) { + defer wg.Done() + semaphore <- struct{}{} + + defer func() { + <-semaphore + }() + + downloadUrl, err:= getSizeLink(pageUrl, size) + if err != nil { + //return err + } + + filename := fmt.Sprintf("%06d.jpg", index) + downloadToFile(filename, dir, downloadUrl) + + }(index, pageUrl) } wg.Wait() @@ -58,7 +73,7 @@ func DownloadBook(outDir string, size string, id string, logger Logger) error { pdfPath := path.Join(outDir, fmt.Sprintf("%s.pdf", strings.ReplaceAll(id, "/", "_"))) logger.Info.Printf("Saving pdf to %s...", pdfPath) - return ImgsToPdf(dir, pdfPath) + return ImgDirToPdf(dir, pdfPath) } // Find the download link for a page of a specific size. diff --git a/pkg/ecload/fetcher.go b/pkg/ecload/fetcher.go index 351f0ac..8bbcc54 100644 --- a/pkg/ecload/fetcher.go +++ b/pkg/ecload/fetcher.go @@ -11,7 +11,6 @@ import ( "net/http" "os" "path" - "sync" ) // Download a html page from an url (must be UTF-8) and convert it to a goquery document. @@ -35,10 +34,7 @@ func fetchDocument(url string) (*goquery.Document, error) { } // Download a file. -func downloadToFile(filename string, dir string, pageUrl string, wg sync.WaitGroup) error { - wg.Add(1) - defer wg.Done() - +func downloadToFile(filename string, dir string, pageUrl string) error { fullpath := path.Join(dir, filename) out, err := os.Create(fullpath) diff --git a/pkg/ecload/pdf.go b/pkg/ecload/pdf.go index 9df75fd..b69907f 100644 --- a/pkg/ecload/pdf.go +++ b/pkg/ecload/pdf.go @@ -5,44 +5,41 @@ package ecload import ( - "image" - _ "image/jpeg" "io/ioutil" "os" "path" + "strings" "github.com/jung-kurt/gofpdf" ) // Concatenate all jpg files in a directory to a single pdf. -func ImgsToPdf(dir string, output string) error { - pdf := gofpdf.New("P", "mm", "", "") - +func ImgDirToPdf(dir string, output string) error { files, err := ioutil.ReadDir(dir) if err != nil { return err } + pdf := gofpdf.New("P", "mm", "", "") + opt := gofpdf.ImageOptions{ImageType: "jpg", ReadDpi: true} + for _, f := range files { - filepath := path.Join(dir, f.Name()) + if strings.HasSuffix(f.Name(), ".jpg") { + filepath := path.Join(dir, f.Name()) - reader, err := os.Open(filepath) - if err != nil { - return err + reader, err := os.Open(filepath) + if err != nil { + return err + } + + pdf.RegisterImageOptionsReader(f.Name(), opt, reader) + info := pdf.RegisterImageOptions(f.Name(), opt) + pdf.AddPageFormat("P", gofpdf.SizeType{ Wd: info.Width(), Ht: info.Height() }) + + pdf.ImageOptions(f.Name(), 0, 0, info.Width(), info.Height(), false, opt, 0, "") + + reader.Close() } - - img, _, err := image.DecodeConfig(reader) - if err != nil { - return err - } - - pdf.AddPageFormat("P", gofpdf.SizeType{Wd: float64(img.Width), Ht: float64(img.Height)}) - - opt := gofpdf.ImageOptions{ImageType: "jpg", ReadDpi: true} - pdf.RegisterImageOptionsReader(f.Name(), opt, reader) - pdf.ImageOptions(f.Name(), 0, 0, 0, 0, false, opt, 0, "") - - reader.Close() } return pdf.OutputFileAndClose(output)