[WIP] migration to psql

TODO:
[ ] stats
[ ] indexes
This commit is contained in:
Las Zenow 2016-07-30 07:10:33 -04:00
parent e1bd235785
commit e72de38725
24 changed files with 648 additions and 936 deletions

View file

@ -3,23 +3,24 @@ package parser
import (
"io/ioutil"
"strings"
"unicode/utf8"
"github.com/jmhodges/gocld2"
"github.com/meskio/epubgo"
)
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
spine, err := epub.Spine()
if err != nil {
return orig_langs
return normalizeLangs(origLangs)
}
var err_spine error
err_spine = nil
var errSpine error
errSpine = nil
langs := []string{}
for err_spine == nil {
for errSpine == nil {
html, err := spine.Open()
err_spine = spine.Next()
errSpine = spine.Next()
if err != nil {
continue
}
@ -29,14 +30,16 @@ func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
if err != nil {
continue
}
langs = append(langs, cld2.Detect(string(buff)))
if utf8.Valid(buff) {
langs = append(langs, cld2.Detect(string(buff)))
}
}
lang := commonLang(langs)
if lang != "un" && differentLang(lang, orig_langs) {
return []string{lang}
if lang == "un" {
return normalizeLangs(origLangs)
}
return orig_langs
return lang
}
func commonLang(langs []string) string {
@ -56,11 +59,14 @@ func commonLang(langs []string) string {
return lang
}
// normalizeLangs reduces the language list declared in the epub metadata to a
// single lowercase language code.
//
// Only the first entry of langs is considered. An entry longer than three
// bytes (for example "en-US") is cut down to its first two bytes; shorter
// entries are kept whole. The result is lowercased. When langs is empty the
// function returns "un", the value GuessLang checks for before falling back to
// the declared languages.
func normalizeLangs(langs []string) string {
	if len(langs) == 0 {
		return "un"
	}

	lang := langs[0]
	if len(lang) > 3 {
		lang = lang[:2]
	}
	// Return the normalized code itself; returning the literal "un" here
	// would discard the language declared by the book.
	return strings.ToLower(lang)
}

View file

@ -5,45 +5,46 @@ import (
"strings"
"github.com/meskio/epubgo"
"gitlab.com/trantor/trantor/lib/database"
)
type MetaData map[string]interface{}
func EpubMetadata(epub *epubgo.Epub) MetaData {
metadata := MetaData{}
func EpubMetadata(epub *epubgo.Epub) database.Book {
book := database.Book{}
for _, m := range epub.MetadataFields() {
data, err := epub.Metadata(m)
if err != nil {
continue
}
switch m {
case "title":
book.Title = cleanStr(strings.Join(data, ", "))
case "creator":
metadata["author"] = parseAuthr(data)
book.Author = parseAuthr(data)
case "contributor":
book.Contributor = cleanStr(strings.Join(data, ", "))
case "publisher":
book.Publisher = cleanStr(strings.Join(data, ", "))
case "description":
metadata[m] = parseDescription(data)
book.Description = parseDescription(data)
case "subject":
metadata[m] = parseSubject(data)
book.Subject = parseSubject(data)
case "date":
metadata[m] = parseDate(data)
book.Date = parseDate(data)
case "language":
metadata["lang"] = GuessLang(epub, data)
case "title", "contributor", "publisher":
metadata[m] = cleanStr(strings.Join(data, ", "))
book.Lang = GuessLang(epub, data)
case "identifier":
attr, _ := epub.MetadataAttr(m)
for i, d := range data {
if attr[i]["scheme"] == "ISBN" {
isbn := ISBN(d)
if isbn != "" {
metadata["isbn"] = isbn
book.Isbn = isbn
}
}
}
default:
metadata[m] = strings.Join(data, ", ")
}
}
return metadata
return book
}
func cleanStr(str string) string {
@ -88,9 +89,21 @@ func parseDescription(description []string) string {
}
// parseSubject flattens the raw subject entries of an epub into a list of
// normalized tags.
//
// Every entry is split on "/" and the resulting pieces are split again on ",".
// Each piece then has its surrounding spaces removed and is lowercased; pieces
// that end up empty are dropped. The returned slice is never nil.
func parseSubject(subject []string) []string {
	parsed := subject
	for _, sep := range []string{"/", ","} {
		// Split the output of the previous pass rather than the raw input,
		// otherwise only the last separator would have any effect.
		split := make([]string, 0, len(parsed))
		for _, s := range parsed {
			split = append(split, strings.Split(s, sep)...)
		}
		parsed = split
	}

	res := []string{}
	for _, s := range parsed {
		sub := strings.ToLower(strings.Trim(s, " "))
		if sub != "" {
			res = append(res, sub)
		}
	}
	return res
}