This repository has been archived on 2025-03-01. You can view files and clone it, but cannot push or open issues or pull requests.
trantor/lib/parser/parser.go
2017-05-21 10:55:56 +00:00

116 lines
2.7 KiB
Go

package parser
import (
"regexp"
"strings"
"github.com/meskio/epubgo"
"gitlab.com/trantor/trantor/lib/database"
)
// EpubMetadata builds a database.Book from the metadata fields of epub.
//
// Every field reported by epub.MetadataFields is read; fields whose values
// cannot be retrieved are skipped. Recognised fields are mapped as follows:
//
//	title, contributor, publisher -> values joined with ", " and cleaned
//	creator                       -> Authors (via parseAuthr)
//	description                   -> Description (via parseDescription)
//	subject                       -> Tags (via parseSubject)
//	date                          -> Date (via parseDate)
//	language                      -> Lang (via GuessLang)
//	identifier                    -> Isbn, taken from identifiers whose
//	                                 "scheme" attribute is "ISBN"
//
// Any other field is ignored. When several identifiers yield a non-empty
// ISBN, the last one wins.
func EpubMetadata(epub *epubgo.Epub) database.Book {
	book := database.Book{}
	for _, m := range epub.MetadataFields() {
		data, err := epub.Metadata(m)
		if err != nil {
			// Best effort: an unreadable field must not abort the whole parse.
			continue
		}
		switch m {
		case "title":
			book.Title = cleanStr(strings.Join(data, ", "))
		case "creator":
			book.Authors = parseAuthr(data)
		case "contributor":
			book.Contributor = cleanStr(strings.Join(data, ", "))
		case "publisher":
			book.Publisher = cleanStr(strings.Join(data, ", "))
		case "description":
			book.Description = parseDescription(data)
		case "subject":
			book.Tags = parseSubject(data)
		case "date":
			book.Date = parseDate(data)
		case "language":
			book.Lang = GuessLang(epub, data)
		case "identifier":
			attr, attrErr := epub.MetadataAttr(m)
			if attrErr != nil {
				// Without attributes the ISBN scheme cannot be recognised.
				continue
			}
			for i, d := range data {
				if i >= len(attr) {
					// Never index past the attributes actually returned.
					break
				}
				if attr[i]["scheme"] != "ISBN" {
					continue
				}
				if isbn := ISBN(d); isbn != "" {
					book.Isbn = isbn
				}
			}
		}
	}
	return book
}
var (
	// cleanStrApostrophes decodes the entity spellings of the apostrophe so
	// that they survive the entity stripping below.
	cleanStrApostrophes = strings.NewReplacer("&#39;", "'", "&apos;", "'")
	// cleanStrEntityRe matches a well-formed named or numeric entity only, so
	// ordinary text between a literal '&' and a later ';' is left alone.
	cleanStrEntityRe = regexp.MustCompile("&#?[0-9A-Za-z]+;")
	// cleanStrTrailingRe matches trailing spaces and commas.
	cleanStrTrailingRe = regexp.MustCompile("[ ,]*$")
)

// cleanStr normalises a metadata string: apostrophe entities are decoded to
// "'", every other HTML/XML entity is removed, and trailing spaces and commas
// are trimmed.
func cleanStr(str string) string {
	str = cleanStrApostrophes.Replace(str)
	str = cleanStrEntityRe.ReplaceAllString(str, "")
	return cleanStrTrailingRe.ReplaceAllString(str, "")
}
var (
	// authrParenRe matches a creator that ends with a parenthesised part and
	// captures the text inside the last pair of parentheses.
	authrParenRe = regexp.MustCompile("^(.*\\( *([^\\)]*) *\\))*$")
	// authrPrefixRe matches "<prefix>: <name>" and captures what follows the
	// first colon.
	authrPrefixRe = regexp.MustCompile("^[^:]*: *(.*)$")
)

// parseAuthr extracts an author name from each creator entry and returns
// them in the same order, one result per input.
//
// For each entry the first applicable rule is used:
//  1. the entry ends with "(...)": the text inside the parentheses is used;
//  2. the entry contains a colon: the text after the first colon is used;
//  3. otherwise the entry itself is used.
//
// Every result is passed through cleanStr.
func parseAuthr(creator []string) []string {
	res := make([]string, len(creator))
	for i, s := range creator {
		if m := authrParenRe.FindStringSubmatch(s); m != nil {
			res[i] = cleanStr(m[2])
			continue
		}
		if m := authrPrefixRe.FindStringSubmatch(s); m != nil {
			res[i] = cleanStr(m[1])
			continue
		}
		res[i] = cleanStr(s)
	}
	return res
}
var (
	// descTagRe matches any markup tag.
	descTagRe = regexp.MustCompile("<[^>]*>")
	// descEntityRe matches a well-formed named or numeric entity.
	descEntityRe = regexp.MustCompile("&#?[0-9A-Za-z]+;")
	// descTrailingRe matches trailing spaces and commas.
	descTrailingRe = regexp.MustCompile("[ ,]*$")
)

// parseDescription turns the description metadata values into plain text.
//
// The values are joined with newlines, every "</p>" becomes a newline, all
// remaining tags are removed, entities are decoded or dropped (see
// decodeDescEntity), trailing spaces and commas are trimmed, and literal
// backslash-n sequences are converted to real newlines.
//
// Entities are handled here in a single pass, after tag removal, instead of
// through a strip-all-entities step up front: stripping first would remove
// "&amp;", "&lt;" and "&gt;" before they could ever be decoded.
func parseDescription(description []string) string {
	str := strings.Join(description, "\n")
	str = strings.Replace(str, "</p>", "\n", -1)
	str = descTagRe.ReplaceAllString(str, "")
	// One pass, so "&amp;lt;" decodes to the text "&lt;" and not to "<".
	str = descEntityRe.ReplaceAllStringFunc(str, decodeDescEntity)
	str = descTrailingRe.ReplaceAllString(str, "")
	return strings.Replace(str, "\\n", "\n", -1)
}

// decodeDescEntity returns the character for the entities parseDescription
// keeps and the empty string for every other entity.
func decodeDescEntity(entity string) string {
	switch entity {
	case "&amp;":
		return "&"
	case "&lt;":
		return "<"
	case "&gt;":
		return ">"
	case "&#39;", "&apos;":
		return "'"
	}
	return ""
}
// parseSubject converts subject metadata into a flat list of tags.
//
// Each entry is cut at every "/" and ",". Each piece has its surrounding
// spaces removed and is lower-cased; pieces that end up empty are dropped.
// Tags keep the order in which they appear in the input, and the returned
// slice is never nil.
func parseSubject(subject []string) []string {
	isSeparator := func(r rune) bool {
		return r == '/' || r == ','
	}

	tags := []string{}
	for _, entry := range subject {
		for _, piece := range strings.FieldsFunc(entry, isSeparator) {
			tag := strings.ToLower(strings.Trim(piece, " "))
			if tag == "" {
				continue
			}
			tags = append(tags, tag)
		}
	}
	return tags
}
// parseDate returns the first date value with every occurrence of the
// "Unspecified: " marker removed. It returns "" when there are no values;
// values after the first are ignored.
func parseDate(date []string) string {
	if len(date) > 0 {
		first := date[0]
		return strings.Replace(first, "Unspecified: ", "", -1)
	}
	return ""
}