Refactor the epub parser to its own module

This commit is contained in:
Las Zenow 2015-01-25 23:37:52 -05:00
parent 08ccdaf27d
commit 09536bd0d8
5 changed files with 111 additions and 121 deletions

5
README
View file

@ -34,7 +34,7 @@ You also need to install go dependencies:
Now you can install Trantor itself: Now you can install Trantor itself:
# go get -tags prod gitlab.com/trantor/trantor # go get gitlab.com/trantor/trantor
You can run trantor in /srv/www/trantor i.e. For this: You can run trantor in /srv/www/trantor i.e. For this:
@ -57,8 +57,7 @@ Go to your browser to: http://localhost:8080
Edit config.go if you want to change the port and other configuration, by default is 8080 Edit config.go if you want to change the port and other configuration, by default is 8080
Now you can compile Trantor: Now you can compile Trantor:
$ go build -tags prod $ go build
(remove '-tags prod' for a faster compilation without language guessing)
Now you can run it: Now you can run it:
$ ./trantor $ ./trantor

View file

@ -1,16 +0,0 @@
// +build !prod
// This is a dummy implementation of GuessLang used to make the compilation faster on development
//
// To build trantor with the proper language guessing do:
// $ go build -tags prod
package main
import (
"github.com/meskio/epubgo"
)
// GuessLang is the stand-in used when building without the "prod" tag: it
// performs no language detection, ignores epub, and hands back orig_langs
// exactly as it received them.
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
return orig_langs
}

View file

@ -1,6 +1,4 @@
// +build prod package parser
package main
import ( import (
"io/ioutil" "io/ioutil"

100
parser/parser.go Normal file
View file

@ -0,0 +1,100 @@
package parser
import (
"regexp"
"strings"
"git.gitorious.org/go-pkg/epubgo.git"
)
// MetaData maps a metadata field name to its parsed value. The value type
// depends on the field: EpubMetadata stores strings, string slices and
// whatever GuessLang returns.
type MetaData map[string]interface{}
// EpubMetadata collects the metadata fields of epub into a MetaData map.
//
// Fields whose value cannot be read are skipped. Values are stored under the
// epub field name, with these exceptions: "creator" is stored as "author",
// "language" is stored as "lang" (after going through GuessLang), and
// "identifier" only produces an "isbn" entry, taken from the identifiers
// whose "scheme" attribute is "ISBN". When several identifiers carry that
// scheme the last one wins.
func EpubMetadata(epub *epubgo.Epub) MetaData {
	metadata := MetaData{}
	for _, m := range epub.MetadataFields() {
		data, err := epub.Metadata(m)
		if err != nil {
			continue
		}
		switch m {
		case "creator":
			metadata["author"] = parseAuthr(data)
		case "description":
			metadata[m] = parseDescription(data)
		case "subject":
			metadata[m] = parseSubject(data)
		case "date":
			metadata[m] = parseDate(data)
		case "language":
			metadata["lang"] = GuessLang(epub, data)
		case "title", "contributor", "publisher":
			metadata[m] = cleanStr(strings.Join(data, ", "))
		case "identifier":
			attr, err := epub.MetadataAttr(m)
			if err != nil {
				// Without the attributes there is no scheme to look at,
				// so no ISBN can be recognised for this field.
				continue
			}
			for i, d := range data {
				if i >= len(attr) {
					// Fewer attribute sets than values: stop instead of
					// indexing past the end of attr.
					break
				}
				if attr[i]["scheme"] == "ISBN" {
					metadata["isbn"] = d
				}
			}
		default:
			metadata[m] = strings.Join(data, ", ")
		}
	}
	return metadata
}
// htmlEntityRe matches an ampersand, any run of characters other than ';',
// and the closing ';' -- i.e. anything shaped like an HTML entity.
var htmlEntityRe = regexp.MustCompile("&[^;]*;")

// cleanStr tidies a metadata string: "&#39;" becomes an apostrophe, every
// other "&...;" sequence is removed, and trailing spaces and commas are
// trimmed. It returns the cleaned string.
func cleanStr(str string) string {
	// The apostrophe entity has to be decoded before the generic removal
	// below, which would otherwise delete it.
	str = strings.Replace(str, "&#39;", "'", -1)
	str = htmlEntityRe.ReplaceAllString(str, "")
	return strings.TrimRight(str, " ,")
}
// parseAuthr turns the raw "creator" entries into cleaned author names, one
// result per input entry and in the same order.
//
// For each entry the first rule that applies is used:
//   - the entry matches the parentheses pattern: the text captured inside
//     the parentheses is the name;
//   - the entry contains a colon: the text after the first colon is the name;
//   - otherwise the entry itself is the name.
//
// Every name is passed through cleanStr.
func parseAuthr(creator []string) []string {
	inParens := regexp.MustCompile("^(.*\\( *([^\\)]*) *\\))*$")
	afterColon := regexp.MustCompile("^[^:]*: *(.*)$")

	authors := make([]string, 0, len(creator))
	for _, raw := range creator {
		name := raw
		if groups := inParens.FindStringSubmatch(raw); groups != nil {
			name = strings.Join(groups[2:], ", ")
		} else if groups := afterColon.FindStringSubmatch(raw); groups != nil {
			name = groups[1]
		}
		authors = append(authors, cleanStr(name))
	}
	return authors
}
var (
	// descEntityRe matches an ampersand, any run of characters other than
	// ';', and the closing ';'.
	descEntityRe = regexp.MustCompile("&[^;]*;")
	// descTagRe matches everything from a '<' up to the next '>'.
	descTagRe = regexp.MustCompile("<[^>]*>")
	// descUnescaper decodes the three kept entities in one pass, so that
	// "&amp;lt;" yields the text "&lt;" rather than "<".
	descUnescaper = strings.NewReplacer("&amp;", "&", "&lt;", "<", "&gt;", ">")
)

// parseDescription builds a plain-text description from the "description"
// entries of a book.
//
// The entries are joined with newlines. "&#39;" becomes an apostrophe,
// "&amp;", "&lt;" and "&gt;" are decoded, and any other "&...;" sequence is
// dropped. Each "</p>" becomes a newline, the remaining tags are removed,
// and a literal backslash followed by 'n' becomes a newline.
func parseDescription(description []string) string {
	str := strings.Join(description, "\n")

	// cleanStr is deliberately not called here: it removes every "&...;"
	// sequence, including the three entities that are decoded further down,
	// which would leave that decoding with nothing to do.
	str = strings.Replace(str, "&#39;", "'", -1)
	str = descEntityRe.ReplaceAllStringFunc(str, func(entity string) string {
		switch entity {
		case "&amp;", "&lt;", "&gt;":
			return entity // kept until the tags have been stripped
		}
		return ""
	})
	str = strings.TrimRight(str, " ,")

	str = strings.Replace(str, "</p>", "\n", -1)
	str = descTagRe.ReplaceAllString(str, "")
	// Decode only now, so that escaped markup such as "&lt;b&gt;" survives
	// as text instead of being removed as a tag.
	str = descUnescaper.Replace(str)
	str = strings.Replace(str, "\\n", "\n", -1)
	return str
}
// parseSubject flattens the "subject" entries into a single list: every
// entry is split on " / " and the pieces are appended in order. With no
// entries the result is nil.
func parseSubject(subject []string) []string {
	var topics []string
	for i := range subject {
		for _, part := range strings.Split(subject[i], " / ") {
			topics = append(topics, part)
		}
	}
	return topics
}
// parseDate returns the first "date" entry with every occurrence of
// "Unspecified: " removed. Later entries are ignored; with no entries the
// result is the empty string.
func parseDate(date []string) string {
	if len(date) > 0 {
		return strings.Replace(date[0], "Unspecified: ", "", -1)
	}
	return ""
}

107
upload.go
View file

@ -8,11 +8,10 @@ import (
"encoding/base64" "encoding/base64"
"io/ioutil" "io/ioutil"
"mime/multipart" "mime/multipart"
"regexp"
"strings"
"github.com/meskio/epubgo" "github.com/meskio/epubgo"
"gitlab.com/trantor/trantor/database" "gitlab.com/trantor/trantor/database"
"gitlab.com/trantor/trantor/parser"
"gitlab.com/trantor/trantor/storage" "gitlab.com/trantor/trantor/storage"
) )
@ -47,7 +46,11 @@ func processFile(req uploadRequest, db *database.DB, store *storage.Store) {
} }
defer epub.Close() defer epub.Close()
book, id := parseFile(epub, store) id := genId()
metadata := parser.EpubMetadata(epub)
metadata["id"] = id
metadata["cover"] = GetCover(epub, id, store)
req.file.Seek(0, 0) req.file.Seek(0, 0)
size, err := store.Store(id, req.file, EPUB_FILE) size, err := store.Store(id, req.file, EPUB_FILE)
if err != nil { if err != nil {
@ -55,8 +58,8 @@ func processFile(req uploadRequest, db *database.DB, store *storage.Store) {
return return
} }
book["filesize"] = size metadata["filesize"] = size
err = db.AddBook(book) err = db.AddBook(metadata)
if err != nil { if err != nil {
log.Error("Error storing metadata (", id, "): ", err) log.Error("Error storing metadata (", id, "): ", err)
return return
@ -107,102 +110,8 @@ func openMultipartEpub(file multipart.File) (*epubgo.Epub, error) {
return epubgo.Load(reader, int64(len(buff))) return epubgo.Load(reader, int64(len(buff)))
} }
// parseFile reads the metadata of epub into a map and assigns the book a
// fresh id.
//
// Fields whose value cannot be read are skipped. Values are stored under the
// epub field name, except that "creator" is stored as "author", "language"
// as "lang" (after going through GuessLang), and "identifier" only produces
// an "isbn" entry, taken from identifiers whose "scheme" attribute is "ISBN".
// The map also receives "id" (from genId) and "cover" (from GetCover, which
// is given epub, the id and store).
//
// It returns the map together with the generated id.
func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]interface{}, id string) {
	book := map[string]interface{}{}
	for _, m := range epub.MetadataFields() {
		data, err := epub.Metadata(m)
		if err != nil {
			continue
		}
		switch m {
		case "creator":
			book["author"] = parseAuthr(data)
		case "description":
			book[m] = parseDescription(data)
		case "subject":
			book[m] = parseSubject(data)
		case "date":
			book[m] = parseDate(data)
		case "language":
			book["lang"] = GuessLang(epub, data)
		case "title", "contributor", "publisher":
			book[m] = cleanStr(strings.Join(data, ", "))
		case "identifier":
			attr, err := epub.MetadataAttr(m)
			if err != nil {
				// Without the attributes there is no scheme to look at,
				// so no ISBN can be recognised for this field.
				continue
			}
			for i, d := range data {
				if i >= len(attr) {
					// Fewer attribute sets than values: stop instead of
					// indexing past the end of attr.
					break
				}
				if attr[i]["scheme"] == "ISBN" {
					book["isbn"] = d
				}
			}
		default:
			book[m] = strings.Join(data, ", ")
		}
	}

	id = genId()
	book["id"] = id
	book["cover"] = GetCover(epub, id, store)
	return book, id
}
func genId() string { func genId() string {
b := make([]byte, 12) b := make([]byte, 12)
rand.Read(b) rand.Read(b)
return base64.URLEncoding.EncodeToString(b) return base64.URLEncoding.EncodeToString(b)
} }
// cleanStr tidies a metadata string in three steps: "&#39;" is replaced by
// an apostrophe, every remaining "&...;" sequence is removed, and the run of
// spaces and commas at the end of the string is cut off.
func cleanStr(str string) string {
	entity := regexp.MustCompile("&[^;]*;")
	trailing := regexp.MustCompile("[ ,]*$")

	withApostrophes := strings.Replace(str, "&#39;", "'", -1)
	withoutEntities := entity.ReplaceAllString(withApostrophes, "")
	return trailing.ReplaceAllString(withoutEntities, "")
}
// parseAuthr converts the raw "creator" entries into cleaned author names,
// keeping one result per entry at the same index.
//
// An entry matching the parentheses pattern contributes the text captured
// inside the parentheses; failing that, an entry with a colon contributes
// the text after the first colon; any other entry is used whole. The chosen
// text always goes through cleanStr.
func parseAuthr(creator []string) []string {
	parenthesised, _ := regexp.Compile("^(.*\\( *([^\\)]*) *\\))*$")
	prefixed, _ := regexp.Compile("^[^:]*: *(.*)$")

	out := make([]string, len(creator))
	for idx, entry := range creator {
		if groups := parenthesised.FindStringSubmatch(entry); groups != nil {
			out[idx] = cleanStr(strings.Join(groups[2:], ", "))
			continue
		}
		if groups := prefixed.FindStringSubmatch(entry); groups != nil {
			out[idx] = cleanStr(groups[1])
			continue
		}
		out[idx] = cleanStr(entry)
	}
	return out
}
// parseDescription builds a plain-text description from the "description"
// entries of a book.
//
// The entries are joined with newlines. "&#39;" becomes an apostrophe,
// "&amp;", "&lt;" and "&gt;" are decoded, and any other "&...;" sequence is
// dropped. Each "</p>" becomes a newline, the remaining tags are removed,
// and a literal backslash followed by 'n' becomes a newline.
func parseDescription(description []string) string {
	entity := regexp.MustCompile("&[^;]*;")
	tag := regexp.MustCompile("<[^>]*>")
	// One pass over the text, so that "&amp;lt;" yields "&lt;" and not "<".
	unescape := strings.NewReplacer("&amp;", "&", "&lt;", "<", "&gt;", ">")

	str := strings.Join(description, "\n")

	// cleanStr is deliberately not called here: it removes every "&...;"
	// sequence, including the three entities that are decoded further down,
	// which would leave that decoding with nothing to do.
	str = strings.Replace(str, "&#39;", "'", -1)
	str = entity.ReplaceAllStringFunc(str, func(found string) string {
		switch found {
		case "&amp;", "&lt;", "&gt;":
			return found // kept until the tags have been stripped
		}
		return ""
	})
	str = strings.TrimRight(str, " ,")

	str = strings.Replace(str, "</p>", "\n", -1)
	str = tag.ReplaceAllString(str, "")
	// Decode only now, so that escaped markup such as "&lt;b&gt;" survives
	// as text instead of being removed as a tag.
	str = unescape.Replace(str)
	str = strings.Replace(str, "\\n", "\n", -1)
	return str
}
// parseSubject splits each "subject" entry on " / " and returns all the
// pieces as one list, in their original order. It returns nil when there
// are no entries.
func parseSubject(subject []string) []string {
	var all []string
	for _, entry := range subject {
		pieces := strings.Split(entry, " / ")
		all = append(all, pieces...)
	}
	return all
}
// parseDate looks only at the first "date" entry and returns it with every
// "Unspecified: " occurrence stripped out. An empty input gives "".
func parseDate(date []string) string {
	const marker = "Unspecified: "
	switch len(date) {
	case 0:
		return ""
	default:
		return strings.Replace(date[0], marker, "", -1)
	}
}