Refactor the epub parser into its own module
This commit is contained in:
parent
08ccdaf27d
commit
09536bd0d8
5 changed files with 111 additions and 121 deletions
66
parser/language.go
Normal file
66
parser/language.go
Normal file
|
@ -0,0 +1,66 @@
|
|||
package parser
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
|
||||
"github.com/meskio/epubgo"
|
||||
"github.com/rainycape/cld2"
|
||||
)
|
||||
|
||||
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
|
||||
spine, err := epub.Spine()
|
||||
if err != nil {
|
||||
return orig_langs
|
||||
}
|
||||
|
||||
var err_spine error
|
||||
err_spine = nil
|
||||
langs := []string{}
|
||||
for err_spine == nil {
|
||||
html, err := spine.Open()
|
||||
err_spine = spine.Next()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
defer html.Close()
|
||||
|
||||
buff, err := ioutil.ReadAll(html)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
langs = append(langs, cld2.Detect(string(buff)))
|
||||
}
|
||||
|
||||
lang := commonLang(langs)
|
||||
if lang != "un" && differentLang(lang, orig_langs) {
|
||||
return []string{lang}
|
||||
}
|
||||
return orig_langs
|
||||
}
|
||||
|
||||
// commonLang returns the language code that occurs most often in langs,
// ignoring "un" entries. It returns "un" when langs is empty or contains
// nothing but "un".
//
// Ties are resolved in favour of the language that appears first in langs,
// so the result is the same on every call for the same input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Walk the input slice rather than the map: map iteration order is
	// random, which would make the winner of a tie unpredictable.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
|
||||
|
||||
// differentLang reports whether lang differs from the language declared in
// orig_langs.
//
// Only the first declared entry is considered, reduced to its first two
// bytes in lower case (so "EN-us" compares as "en"). When there is no entry,
// or the first entry is shorter than two bytes, the declared language is
// treated as "un".
func differentLang(lang string, orig_langs []string) bool {
	orig_lang := "un"
	// The length check must be on the first string, not on the slice:
	// slicing [0:2] on a shorter string would panic.
	if len(orig_langs) > 0 && len(orig_langs[0]) >= 2 {
		orig_lang = strings.ToLower(orig_langs[0][:2])
	}
	return orig_lang != lang
}
|
100
parser/parser.go
Normal file
100
parser/parser.go
Normal file
|
@ -0,0 +1,100 @@
|
|||
package parser
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"git.gitorious.org/go-pkg/epubgo.git"
|
||||
)
|
||||
|
||||
// MetaData maps a metadata field name to its parsed value. The value type
// depends on the field: EpubMetadata stores either a string or a []string.
type MetaData map[string]interface{}
|
||||
|
||||
func EpubMetadata(epub *epubgo.Epub) MetaData {
|
||||
metadata := MetaData{}
|
||||
for _, m := range epub.MetadataFields() {
|
||||
data, err := epub.Metadata(m)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
switch m {
|
||||
case "creator":
|
||||
metadata["author"] = parseAuthr(data)
|
||||
case "description":
|
||||
metadata[m] = parseDescription(data)
|
||||
case "subject":
|
||||
metadata[m] = parseSubject(data)
|
||||
case "date":
|
||||
metadata[m] = parseDate(data)
|
||||
case "language":
|
||||
metadata["lang"] = GuessLang(epub, data)
|
||||
case "title", "contributor", "publisher":
|
||||
metadata[m] = cleanStr(strings.Join(data, ", "))
|
||||
case "identifier":
|
||||
attr, _ := epub.MetadataAttr(m)
|
||||
for i, d := range data {
|
||||
if attr[i]["scheme"] == "ISBN" {
|
||||
metadata["isbn"] = d
|
||||
}
|
||||
}
|
||||
default:
|
||||
metadata[m] = strings.Join(data, ", ")
|
||||
}
|
||||
}
|
||||
return metadata
|
||||
}
|
||||
|
||||
// htmlEntityRe matches an ampersand followed by any run of non-semicolon
// characters and a closing semicolon, i.e. anything shaped like an HTML
// entity.
var htmlEntityRe = regexp.MustCompile("&[^;]*;")

// cleanStr normalises a metadata string: the apostrophe entity is decoded,
// every remaining entity-like "&...;" sequence is removed, and trailing
// spaces and commas are trimmed.
func cleanStr(str string) string {
	// Decode the apostrophe first; the entity stripping below would
	// otherwise delete it together with every other entity.
	str = strings.Replace(str, "&#39;", "'", -1)
	str = htmlEntityRe.ReplaceAllString(str, "")
	return strings.TrimRight(str, " ,")
}
|
||||
|
||||
func parseAuthr(creator []string) []string {
|
||||
exp1, _ := regexp.Compile("^(.*\\( *([^\\)]*) *\\))*$")
|
||||
exp2, _ := regexp.Compile("^[^:]*: *(.*)$")
|
||||
res := make([]string, len(creator))
|
||||
for i, s := range creator {
|
||||
auth := exp1.FindStringSubmatch(s)
|
||||
if auth != nil {
|
||||
res[i] = cleanStr(strings.Join(auth[2:], ", "))
|
||||
} else {
|
||||
auth := exp2.FindStringSubmatch(s)
|
||||
if auth != nil {
|
||||
res[i] = cleanStr(auth[1])
|
||||
} else {
|
||||
res[i] = cleanStr(s)
|
||||
}
|
||||
}
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
func parseDescription(description []string) string {
|
||||
str := cleanStr(strings.Join(description, "\n"))
|
||||
str = strings.Replace(str, "</p>", "\n", -1)
|
||||
exp, _ := regexp.Compile("<[^>]*>")
|
||||
str = exp.ReplaceAllString(str, "")
|
||||
str = strings.Replace(str, "&", "&", -1)
|
||||
str = strings.Replace(str, "<", "<", -1)
|
||||
str = strings.Replace(str, ">", ">", -1)
|
||||
str = strings.Replace(str, "\\n", "\n", -1)
|
||||
return str
|
||||
}
|
||||
|
||||
// parseSubject flattens the subject entries into individual topics: every
// entry is split on " / " and the pieces are collected in order. It returns
// nil when subject is empty.
func parseSubject(subject []string) []string {
	var topics []string
	for i := range subject {
		for _, piece := range strings.Split(subject[i], " / ") {
			topics = append(topics, piece)
		}
	}
	return topics
}
|
||||
|
||||
// parseDate returns the first date entry with every "Unspecified: " marker
// removed, or the empty string when there are no entries.
func parseDate(date []string) string {
	cleaned := ""
	if len(date) > 0 {
		cleaned = strings.Replace(date[0], "Unspecified: ", "", -1)
	}
	return cleaned
}
|
Reference in a new issue