Refactor the epub parser into its own module
This commit is contained in:
parent
08ccdaf27d
commit
09536bd0d8
5 changed files with 111 additions and 121 deletions
5
README
5
README
|
@ -34,7 +34,7 @@ Yo also need to install go dependences:
|
||||||
|
|
||||||
Now you can install Trantor itself:
|
Now you can install Trantor itself:
|
||||||
|
|
||||||
# go get -tags prod gitlab.com/trantor/trantor
|
# go get gitlab.com/trantor/trantor
|
||||||
|
|
||||||
You can run trantor in /srv/www/trantor i.e. For this:
|
You can run trantor in /srv/www/trantor i.e. For this:
|
||||||
|
|
||||||
|
@ -57,8 +57,7 @@ Go to your browser to: http://localhost:8080
|
||||||
Edit config.go if you want to change the port and other configuration, by default is 8080
|
Edit config.go if you want to change the port and other configuration, by default is 8080
|
||||||
|
|
||||||
Now you can compile Trantor:
|
Now you can compile Trantor:
|
||||||
$ go build -tags prod
|
$ go build
|
||||||
(remove '-tags prod' for a faster compilation without language guessing)
|
|
||||||
|
|
||||||
Now you can run it:
|
Now you can run it:
|
||||||
$ ./trantor
|
$ ./trantor
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
// +build !prod
|
|
||||||
|
|
||||||
// This is a dummy implementation of GuessLang used to make the compilation faster on development
|
|
||||||
//
|
|
||||||
// To build trantor with the proper language guessing do:
|
|
||||||
// $ go build -tags prod
|
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/meskio/epubgo"
|
|
||||||
)
|
|
||||||
|
|
||||||
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
|
|
||||||
return orig_langs
|
|
||||||
}
|
|
|
@ -1,6 +1,4 @@
|
||||||
// +build prod
|
package parser
|
||||||
|
|
||||||
package main
|
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"io/ioutil"
|
"io/ioutil"
|
100
parser/parser.go
Normal file
100
parser/parser.go
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
package parser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.gitorious.org/go-pkg/epubgo.git"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MetaData holds the metadata extracted from an epub, keyed by field name.
// Values are untyped; EpubMetadata stores both plain strings and string
// slices in it.
type MetaData map[string]interface{}
|
||||||
|
|
||||||
|
func EpubMetadata(epub *epubgo.Epub) MetaData {
|
||||||
|
metadata := MetaData{}
|
||||||
|
for _, m := range epub.MetadataFields() {
|
||||||
|
data, err := epub.Metadata(m)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
switch m {
|
||||||
|
case "creator":
|
||||||
|
metadata["author"] = parseAuthr(data)
|
||||||
|
case "description":
|
||||||
|
metadata[m] = parseDescription(data)
|
||||||
|
case "subject":
|
||||||
|
metadata[m] = parseSubject(data)
|
||||||
|
case "date":
|
||||||
|
metadata[m] = parseDate(data)
|
||||||
|
case "language":
|
||||||
|
metadata["lang"] = GuessLang(epub, data)
|
||||||
|
case "title", "contributor", "publisher":
|
||||||
|
metadata[m] = cleanStr(strings.Join(data, ", "))
|
||||||
|
case "identifier":
|
||||||
|
attr, _ := epub.MetadataAttr(m)
|
||||||
|
for i, d := range data {
|
||||||
|
if attr[i]["scheme"] == "ISBN" {
|
||||||
|
metadata["isbn"] = d
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
metadata[m] = strings.Join(data, ", ")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return metadata
|
||||||
|
}
|
||||||
|
|
||||||
|
// entityPattern matches anything shaped like an HTML/XML entity: an ampersand,
// any run of characters other than ';', and the closing ';'.
var entityPattern = regexp.MustCompile("&[^;]*;")

// cleanStr tidies up a metadata string:
//   - the encoded apostrophe "&#39;" is turned into a literal "'",
//   - every remaining "&...;" sequence is removed,
//   - trailing spaces and commas are trimmed.
func cleanStr(str string) string {
	// Decode the apostrophe before the generic strip below, otherwise it
	// would be deleted together with every other entity.
	str = strings.Replace(str, "&#39;", "'", -1)
	str = entityPattern.ReplaceAllString(str, "")
	return strings.TrimRight(str, " ,")
}
|
||||||
|
|
||||||
|
func parseAuthr(creator []string) []string {
|
||||||
|
exp1, _ := regexp.Compile("^(.*\\( *([^\\)]*) *\\))*$")
|
||||||
|
exp2, _ := regexp.Compile("^[^:]*: *(.*)$")
|
||||||
|
res := make([]string, len(creator))
|
||||||
|
for i, s := range creator {
|
||||||
|
auth := exp1.FindStringSubmatch(s)
|
||||||
|
if auth != nil {
|
||||||
|
res[i] = cleanStr(strings.Join(auth[2:], ", "))
|
||||||
|
} else {
|
||||||
|
auth := exp2.FindStringSubmatch(s)
|
||||||
|
if auth != nil {
|
||||||
|
res[i] = cleanStr(auth[1])
|
||||||
|
} else {
|
||||||
|
res[i] = cleanStr(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseDescription(description []string) string {
|
||||||
|
str := cleanStr(strings.Join(description, "\n"))
|
||||||
|
str = strings.Replace(str, "</p>", "\n", -1)
|
||||||
|
exp, _ := regexp.Compile("<[^>]*>")
|
||||||
|
str = exp.ReplaceAllString(str, "")
|
||||||
|
str = strings.Replace(str, "&", "&", -1)
|
||||||
|
str = strings.Replace(str, "<", "<", -1)
|
||||||
|
str = strings.Replace(str, ">", ">", -1)
|
||||||
|
str = strings.Replace(str, "\\n", "\n", -1)
|
||||||
|
return str
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSubject flattens the subject entries into a single list of tags,
// splitting every entry on the " / " separator. The result is nil when no
// entries are given.
func parseSubject(subject []string) []string {
	var tags []string
	for i := range subject {
		parts := strings.Split(subject[i], " / ")
		tags = append(tags, parts...)
	}
	return tags
}
|
||||||
|
|
||||||
|
// parseDate returns the first date entry with every "Unspecified: " marker
// removed, or the empty string when there are no entries. Any further
// entries are ignored.
func parseDate(date []string) string {
	if len(date) > 0 {
		first := date[0]
		return strings.Replace(first, "Unspecified: ", "", -1)
	}
	return ""
}
|
107
upload.go
107
upload.go
|
@ -8,11 +8,10 @@ import (
|
||||||
"encoding/base64"
|
"encoding/base64"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"mime/multipart"
|
"mime/multipart"
|
||||||
"regexp"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/meskio/epubgo"
|
"github.com/meskio/epubgo"
|
||||||
"gitlab.com/trantor/trantor/database"
|
"gitlab.com/trantor/trantor/database"
|
||||||
|
"gitlab.com/trantor/trantor/parser"
|
||||||
"gitlab.com/trantor/trantor/storage"
|
"gitlab.com/trantor/trantor/storage"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -47,7 +46,11 @@ func processFile(req uploadRequest, db *database.DB, store *storage.Store) {
|
||||||
}
|
}
|
||||||
defer epub.Close()
|
defer epub.Close()
|
||||||
|
|
||||||
book, id := parseFile(epub, store)
|
id := genId()
|
||||||
|
metadata := parser.EpubMetadata(epub)
|
||||||
|
metadata["id"] = id
|
||||||
|
metadata["cover"] = GetCover(epub, id, store)
|
||||||
|
|
||||||
req.file.Seek(0, 0)
|
req.file.Seek(0, 0)
|
||||||
size, err := store.Store(id, req.file, EPUB_FILE)
|
size, err := store.Store(id, req.file, EPUB_FILE)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -55,8 +58,8 @@ func processFile(req uploadRequest, db *database.DB, store *storage.Store) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
book["filesize"] = size
|
metadata["filesize"] = size
|
||||||
err = db.AddBook(book)
|
err = db.AddBook(metadata)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error("Error storing metadata (", id, "): ", err)
|
log.Error("Error storing metadata (", id, "): ", err)
|
||||||
return
|
return
|
||||||
|
@ -107,102 +110,8 @@ func openMultipartEpub(file multipart.File) (*epubgo.Epub, error) {
|
||||||
return epubgo.Load(reader, int64(len(buff)))
|
return epubgo.Load(reader, int64(len(buff)))
|
||||||
}
|
}
|
||||||
|
|
||||||
func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]interface{}, id string) {
|
|
||||||
book := map[string]interface{}{}
|
|
||||||
for _, m := range epub.MetadataFields() {
|
|
||||||
data, err := epub.Metadata(m)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
switch m {
|
|
||||||
case "creator":
|
|
||||||
book["author"] = parseAuthr(data)
|
|
||||||
case "description":
|
|
||||||
book[m] = parseDescription(data)
|
|
||||||
case "subject":
|
|
||||||
book[m] = parseSubject(data)
|
|
||||||
case "date":
|
|
||||||
book[m] = parseDate(data)
|
|
||||||
case "language":
|
|
||||||
book["lang"] = GuessLang(epub, data)
|
|
||||||
case "title", "contributor", "publisher":
|
|
||||||
book[m] = cleanStr(strings.Join(data, ", "))
|
|
||||||
case "identifier":
|
|
||||||
attr, _ := epub.MetadataAttr(m)
|
|
||||||
for i, d := range data {
|
|
||||||
if attr[i]["scheme"] == "ISBN" {
|
|
||||||
book["isbn"] = d
|
|
||||||
}
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
book[m] = strings.Join(data, ", ")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
id = genId()
|
|
||||||
book["id"] = id
|
|
||||||
book["cover"] = GetCover(epub, id, store)
|
|
||||||
return book, id
|
|
||||||
}
|
|
||||||
|
|
||||||
func genId() string {
|
func genId() string {
|
||||||
b := make([]byte, 12)
|
b := make([]byte, 12)
|
||||||
rand.Read(b)
|
rand.Read(b)
|
||||||
return base64.URLEncoding.EncodeToString(b)
|
return base64.URLEncoding.EncodeToString(b)
|
||||||
}
|
}
|
||||||
|
|
||||||
func cleanStr(str string) string {
|
|
||||||
str = strings.Replace(str, "'", "'", -1)
|
|
||||||
exp, _ := regexp.Compile("&[^;]*;")
|
|
||||||
str = exp.ReplaceAllString(str, "")
|
|
||||||
exp, _ = regexp.Compile("[ ,]*$")
|
|
||||||
str = exp.ReplaceAllString(str, "")
|
|
||||||
return str
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseAuthr(creator []string) []string {
|
|
||||||
exp1, _ := regexp.Compile("^(.*\\( *([^\\)]*) *\\))*$")
|
|
||||||
exp2, _ := regexp.Compile("^[^:]*: *(.*)$")
|
|
||||||
res := make([]string, len(creator))
|
|
||||||
for i, s := range creator {
|
|
||||||
auth := exp1.FindStringSubmatch(s)
|
|
||||||
if auth != nil {
|
|
||||||
res[i] = cleanStr(strings.Join(auth[2:], ", "))
|
|
||||||
} else {
|
|
||||||
auth := exp2.FindStringSubmatch(s)
|
|
||||||
if auth != nil {
|
|
||||||
res[i] = cleanStr(auth[1])
|
|
||||||
} else {
|
|
||||||
res[i] = cleanStr(s)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseDescription(description []string) string {
|
|
||||||
str := cleanStr(strings.Join(description, "\n"))
|
|
||||||
str = strings.Replace(str, "</p>", "\n", -1)
|
|
||||||
exp, _ := regexp.Compile("<[^>]*>")
|
|
||||||
str = exp.ReplaceAllString(str, "")
|
|
||||||
str = strings.Replace(str, "&", "&", -1)
|
|
||||||
str = strings.Replace(str, "<", "<", -1)
|
|
||||||
str = strings.Replace(str, ">", ">", -1)
|
|
||||||
str = strings.Replace(str, "\\n", "\n", -1)
|
|
||||||
return str
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseSubject(subject []string) []string {
|
|
||||||
var res []string
|
|
||||||
for _, s := range subject {
|
|
||||||
res = append(res, strings.Split(s, " / ")...)
|
|
||||||
}
|
|
||||||
return res
|
|
||||||
}
|
|
||||||
|
|
||||||
func parseDate(date []string) string {
|
|
||||||
if len(date) == 0 {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return strings.Replace(date[0], "Unspecified: ", "", -1)
|
|
||||||
}
|
|
||||||
|
|
Reference in a new issue