Refactor the epub parser into its own module

This commit is contained in:
Las Zenow 2015-01-25 23:37:52 -05:00
parent 08ccdaf27d
commit 09536bd0d8
5 changed files with 111 additions and 121 deletions

5
README
View file

@ -34,7 +34,7 @@ You also need to install the Go dependencies:
Now you can install Trantor itself:
# go get -tags prod gitlab.com/trantor/trantor
# go get gitlab.com/trantor/trantor
You can run Trantor from a directory such as /srv/www/trantor. To do this:
@ -57,8 +57,7 @@ Go to your browser to: http://localhost:8080
Edit config.go if you want to change the port or other settings; the default port is 8080
Now you can compile Trantor:
$ go build -tags prod
(remove '-tags prod' for a faster compilation without language guessing)
$ go build
Now you can run it:
$ ./trantor

View file

@ -1,16 +0,0 @@
// +build !prod
// This is a dummy implementation of GuessLang used to make the compilation faster on development
//
// To build trantor with the proper language guessing do:
// $ go build -tags prod
package main
import (
"github.com/meskio/epubgo"
)
// GuessLang is the development stub used when the "prod" build tag is not set.
// It does not inspect epub at all and returns orig_langs unchanged.
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
return orig_langs
}

View file

@ -1,6 +1,4 @@
// +build prod
package main
package parser
import (
"io/ioutil"

100
parser/parser.go Normal file
View file

@ -0,0 +1,100 @@
package parser
import (
"regexp"
"strings"
"git.gitorious.org/go-pkg/epubgo.git"
)
// MetaData holds the metadata extracted from an epub, keyed by field name.
// EpubMetadata stores either a string or a []string under each key.
type MetaData map[string]interface{}
// EpubMetadata reads every metadata field reported by epub and returns the
// values normalized into a MetaData map.
//
// Fields whose value cannot be read are skipped. The mapping is:
//   - "creator" is stored under "author" as a list of cleaned names;
//   - "description", "subject" and "date" go through their dedicated parsers;
//   - "language" is stored under "lang" as the result of GuessLang;
//   - "title", "contributor" and "publisher" are joined with ", " and cleaned;
//   - "identifier" values whose "scheme" attribute is "ISBN" are stored under
//     "isbn" (when several qualify, the last one wins);
//   - any other field is stored under its own name, joined with ", ".
func EpubMetadata(epub *epubgo.Epub) MetaData {
	metadata := MetaData{}
	for _, m := range epub.MetadataFields() {
		data, err := epub.Metadata(m)
		if err != nil {
			continue
		}
		switch m {
		case "creator":
			metadata["author"] = parseAuthr(data)
		case "description":
			metadata[m] = parseDescription(data)
		case "subject":
			metadata[m] = parseSubject(data)
		case "date":
			metadata[m] = parseDate(data)
		case "language":
			metadata["lang"] = GuessLang(epub, data)
		case "title", "contributor", "publisher":
			metadata[m] = cleanStr(strings.Join(data, ", "))
		case "identifier":
			// The error is deliberately dropped: the ISBN is optional, and the
			// bounds check below copes with attr being empty or too short.
			attr, _ := epub.MetadataAttr(m)
			for i, d := range data {
				// Never index past attr; nothing visible here guarantees it
				// has one entry per value in data.
				if i < len(attr) && attr[i]["scheme"] == "ISBN" {
					metadata["isbn"] = d
				}
			}
		default:
			metadata[m] = strings.Join(data, ", ")
		}
	}
	return metadata
}
// entityRe matches everything from an ampersand up to and including the next
// semicolon, which covers character references such as "&nbsp;".
var entityRe = regexp.MustCompile("&[^;]*;")

// cleanStr tidies a metadata string: the apostrophe reference "&#39;" is
// decoded, every other "&...;" sequence is removed, and trailing spaces and
// commas are dropped.
func cleanStr(str string) string {
	// Decode the apostrophe before the blanket removal below, otherwise the
	// reference would simply be deleted ("Don&#39;t" -> "Dont").
	str = strings.Replace(str, "&#39;", "'", -1)
	str = entityRe.ReplaceAllString(str, "")
	// Equivalent to deleting the match of "[ ,]*$".
	return strings.TrimRight(str, " ,")
}
var (
	// authorParenRe matches a creator that is empty or ends in a
	// parenthesised part; group 2 is the text inside those trailing
	// parentheses.
	authorParenRe = regexp.MustCompile("^(.*\\( *([^\\)]*) *\\))*$")
	// authorPrefixRe matches "<label>: <rest>"; group 1 is everything after
	// the first colon and the spaces following it.
	authorPrefixRe = regexp.MustCompile("^[^:]*: *(.*)$")
)

// parseAuthr converts the raw "creator" values into author names.
//
// For each entry, the text inside trailing parentheses is preferred; failing
// that, the text after a "label:" prefix; otherwise the entry itself. Every
// result is passed through cleanStr. The returned slice has the same length
// and order as creator.
func parseAuthr(creator []string) []string {
	res := make([]string, len(creator))
	for i, s := range creator {
		name := s
		if m := authorParenRe.FindStringSubmatch(s); m != nil {
			// The pattern has exactly two groups, so m[2] is all of m[2:].
			name = m[2]
		} else if m := authorPrefixRe.FindStringSubmatch(s); m != nil {
			name = m[1]
		}
		res[i] = cleanStr(name)
	}
	return res
}
var (
	// descEntityRe matches everything from an ampersand up to and including
	// the next semicolon.
	descEntityRe = regexp.MustCompile("&[^;]*;")
	// descTagRe matches one markup tag.
	descTagRe = regexp.MustCompile("<[^>]*>")
	// descUnescaper decodes the three basic references in a single pass, so
	// "&amp;lt;" becomes "&lt;" rather than "<".
	descUnescaper = strings.NewReplacer("&amp;", "&", "&lt;", "<", "&gt;", ">")
)

// parseDescription turns the raw "description" values into plain text.
//
// The values are joined with newlines, "&#39;" is decoded, unknown "&...;"
// sequences are removed, trailing spaces and commas are trimmed, "</p>"
// becomes a newline, remaining tags are stripped, "&amp;", "&lt;" and "&gt;"
// are decoded, and literal backslash-n sequences become newlines.
//
// The clean-up is done here instead of through cleanStr because cleanStr
// removes every "&...;" sequence, which used to delete "&amp;", "&lt;" and
// "&gt;" before they could be decoded.
func parseDescription(description []string) string {
	str := strings.Join(description, "\n")

	str = strings.Replace(str, "&#39;", "'", -1)
	str = descEntityRe.ReplaceAllStringFunc(str, func(entity string) string {
		switch entity {
		case "&amp;", "&lt;", "&gt;":
			// Kept encoded until the tags are gone, so a decoded "<" or ">"
			// is never mistaken for markup.
			return entity
		}
		return ""
	})
	str = strings.TrimRight(str, " ,")

	str = strings.Replace(str, "</p>", "\n", -1)
	str = descTagRe.ReplaceAllString(str, "")
	str = descUnescaper.Replace(str)
	str = strings.Replace(str, "\\n", "\n", -1)
	return str
}
// parseSubject flattens the raw "subject" values into individual subjects.
// Each value is split on " / " and the pieces are appended in order. The
// result is nil when subject is empty.
func parseSubject(subject []string) []string {
	var topics []string
	for i := 0; i < len(subject); i++ {
		pieces := strings.Split(subject[i], " / ")
		topics = append(topics, pieces...)
	}
	return topics
}
// parseDate returns the first "date" value with every "Unspecified: " marker
// removed, or the empty string when there is no value at all.
func parseDate(date []string) string {
	first := ""
	if len(date) > 0 {
		first = date[0]
	}
	return strings.Replace(first, "Unspecified: ", "", -1)
}

107
upload.go
View file

@ -8,11 +8,10 @@ import (
"encoding/base64"
"io/ioutil"
"mime/multipart"
"regexp"
"strings"
"github.com/meskio/epubgo"
"gitlab.com/trantor/trantor/database"
"gitlab.com/trantor/trantor/parser"
"gitlab.com/trantor/trantor/storage"
)
@ -47,7 +46,11 @@ func processFile(req uploadRequest, db *database.DB, store *storage.Store) {
}
defer epub.Close()
book, id := parseFile(epub, store)
id := genId()
metadata := parser.EpubMetadata(epub)
metadata["id"] = id
metadata["cover"] = GetCover(epub, id, store)
req.file.Seek(0, 0)
size, err := store.Store(id, req.file, EPUB_FILE)
if err != nil {
@ -55,8 +58,8 @@ func processFile(req uploadRequest, db *database.DB, store *storage.Store) {
return
}
book["filesize"] = size
err = db.AddBook(book)
metadata["filesize"] = size
err = db.AddBook(metadata)
if err != nil {
log.Error("Error storing metadata (", id, "): ", err)
return
@ -107,102 +110,8 @@ func openMultipartEpub(file multipart.File) (*epubgo.Epub, error) {
return epubgo.Load(reader, int64(len(buff)))
}
// parseFile extracts the metadata of epub into a map and assigns the book a
// freshly generated id.
//
// Fields whose value cannot be read are skipped. "creator" is stored under
// "author", "language" under "lang" (via GuessLang), and "identifier" values
// whose "scheme" attribute is "ISBN" under "isbn" (the last one wins).
// "description", "subject" and "date" go through their dedicated parsers;
// "title", "contributor" and "publisher" are joined with ", " and cleaned;
// every other field is joined with ", " under its own name.
//
// The map additionally receives "id" and "cover", the latter being whatever
// GetCover returns for epub, the new id and store. The id is also returned
// separately.
func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]interface{}, id string) {
	book := map[string]interface{}{}
	for _, m := range epub.MetadataFields() {
		data, err := epub.Metadata(m)
		if err != nil {
			continue
		}
		switch m {
		case "creator":
			book["author"] = parseAuthr(data)
		case "description":
			book[m] = parseDescription(data)
		case "subject":
			book[m] = parseSubject(data)
		case "date":
			book[m] = parseDate(data)
		case "language":
			book["lang"] = GuessLang(epub, data)
		case "title", "contributor", "publisher":
			book[m] = cleanStr(strings.Join(data, ", "))
		case "identifier":
			// The error is deliberately dropped: the ISBN is optional, and the
			// bounds check below copes with attr being empty or too short.
			attr, _ := epub.MetadataAttr(m)
			for i, d := range data {
				// Never index past attr; nothing visible here guarantees it
				// has one entry per value in data.
				if i < len(attr) && attr[i]["scheme"] == "ISBN" {
					book["isbn"] = d
				}
			}
		default:
			book[m] = strings.Join(data, ", ")
		}
	}

	id = genId()
	book["id"] = id
	book["cover"] = GetCover(epub, id, store)
	return book, id
}
// genId returns a new book identifier: 12 bytes obtained from rand.Read,
// encoded with the URL-safe base64 alphabet (16 characters, no padding).
//
// The result of rand.Read is not checked. Which rand package is in scope is
// not visible in this hunk (presumably crypto/rand; confirm in the imports).
func genId() string {
	var raw [12]byte
	rand.Read(raw[:])
	return base64.URLEncoding.EncodeToString(raw[:])
}
// htmlEntityRe matches everything from an ampersand up to and including the
// next semicolon, which covers character references such as "&nbsp;".
var htmlEntityRe = regexp.MustCompile("&[^;]*;")

// cleanStr tidies a metadata string: the apostrophe reference "&#39;" is
// decoded, every other "&...;" sequence is removed, and trailing spaces and
// commas are dropped.
func cleanStr(str string) string {
	// The apostrophe must be decoded before the blanket removal below.
	str = strings.Replace(str, "&#39;", "'", -1)
	str = htmlEntityRe.ReplaceAllString(str, "")
	// Equivalent to deleting the match of "[ ,]*$".
	return strings.TrimRight(str, " ,")
}
var (
	// creatorParenRe matches a creator that is empty or ends in a
	// parenthesised part; group 2 is the text inside those trailing
	// parentheses.
	creatorParenRe = regexp.MustCompile("^(.*\\( *([^\\)]*) *\\))*$")
	// creatorPrefixRe matches "<label>: <rest>"; group 1 is everything after
	// the first colon and the spaces following it.
	creatorPrefixRe = regexp.MustCompile("^[^:]*: *(.*)$")
)

// parseAuthr converts the raw "creator" values into author names.
//
// For each entry, the text inside trailing parentheses is preferred; failing
// that, the text after a "label:" prefix; otherwise the entry itself. Every
// result is passed through cleanStr. The returned slice has the same length
// and order as creator.
func parseAuthr(creator []string) []string {
	res := make([]string, len(creator))
	for i, s := range creator {
		name := s
		if m := creatorParenRe.FindStringSubmatch(s); m != nil {
			// The pattern has exactly two groups, so m[2] is all of m[2:].
			name = m[2]
		} else if m := creatorPrefixRe.FindStringSubmatch(s); m != nil {
			name = m[1]
		}
		res[i] = cleanStr(name)
	}
	return res
}
var (
	// descriptionEntityRe matches everything from an ampersand up to and
	// including the next semicolon.
	descriptionEntityRe = regexp.MustCompile("&[^;]*;")
	// descriptionTagRe matches one markup tag.
	descriptionTagRe = regexp.MustCompile("<[^>]*>")
	// descriptionUnescaper decodes the three basic references in a single
	// pass, so "&amp;lt;" becomes "&lt;" rather than "<".
	descriptionUnescaper = strings.NewReplacer("&amp;", "&", "&lt;", "<", "&gt;", ">")
)

// parseDescription turns the raw "description" values into plain text.
//
// The values are joined with newlines, "&#39;" is decoded, unknown "&...;"
// sequences are removed, trailing spaces and commas are trimmed, "</p>"
// becomes a newline, remaining tags are stripped, "&amp;", "&lt;" and "&gt;"
// are decoded, and literal backslash-n sequences become newlines.
//
// The clean-up is done here instead of through cleanStr because cleanStr
// removes every "&...;" sequence, which used to delete "&amp;", "&lt;" and
// "&gt;" before they could be decoded.
func parseDescription(description []string) string {
	str := strings.Join(description, "\n")

	str = strings.Replace(str, "&#39;", "'", -1)
	str = descriptionEntityRe.ReplaceAllStringFunc(str, func(entity string) string {
		switch entity {
		case "&amp;", "&lt;", "&gt;":
			// Kept encoded until the tags are gone, so a decoded "<" or ">"
			// is never mistaken for markup.
			return entity
		}
		return ""
	})
	str = strings.TrimRight(str, " ,")

	str = strings.Replace(str, "</p>", "\n", -1)
	str = descriptionTagRe.ReplaceAllString(str, "")
	str = descriptionUnescaper.Replace(str)
	str = strings.Replace(str, "\\n", "\n", -1)
	return str
}
// parseSubject flattens the raw "subject" values into individual subjects.
// Each value is split on " / " and the pieces are appended in order. The
// result is nil when subject is empty.
func parseSubject(subject []string) []string {
	var flat []string
	for idx := range subject {
		for _, piece := range strings.Split(subject[idx], " / ") {
			flat = append(flat, piece)
		}
	}
	return flat
}
// parseDate returns the first "date" value with every "Unspecified: " marker
// removed, or the empty string when there is no value at all.
func parseDate(date []string) string {
	if len(date) > 0 {
		raw := date[0]
		return strings.Replace(raw, "Unspecified: ", "", -1)
	}
	return ""
}