[WIP] migration to psql
TODO: [ ] stats [ ] indexes
This commit is contained in:
parent
e1bd235785
commit
e72de38725
24 changed files with 648 additions and 936 deletions
|
@ -3,23 +3,24 @@ package parser
|
|||
import (
|
||||
"io/ioutil"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/jmhodges/gocld2"
|
||||
"github.com/meskio/epubgo"
|
||||
)
|
||||
|
||||
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
|
||||
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
|
||||
spine, err := epub.Spine()
|
||||
if err != nil {
|
||||
return orig_langs
|
||||
return normalizeLangs(origLangs)
|
||||
}
|
||||
|
||||
var err_spine error
|
||||
err_spine = nil
|
||||
var errSpine error
|
||||
errSpine = nil
|
||||
langs := []string{}
|
||||
for err_spine == nil {
|
||||
for errSpine == nil {
|
||||
html, err := spine.Open()
|
||||
err_spine = spine.Next()
|
||||
errSpine = spine.Next()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
@ -29,14 +30,16 @@ func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
|
|||
if err != nil {
|
||||
continue
|
||||
}
|
||||
langs = append(langs, cld2.Detect(string(buff)))
|
||||
if utf8.Valid(buff) {
|
||||
langs = append(langs, cld2.Detect(string(buff)))
|
||||
}
|
||||
}
|
||||
|
||||
lang := commonLang(langs)
|
||||
if lang != "un" && differentLang(lang, orig_langs) {
|
||||
return []string{lang}
|
||||
if lang == "un" {
|
||||
return normalizeLangs(origLangs)
|
||||
}
|
||||
return orig_langs
|
||||
return lang
|
||||
}
|
||||
|
||||
func commonLang(langs []string) string {
|
||||
|
@ -56,11 +59,14 @@ func commonLang(langs []string) string {
|
|||
return lang
|
||||
}
|
||||
|
||||
func differentLang(lang string, orig_langs []string) bool {
|
||||
orig_lang := "un"
|
||||
if len(orig_langs) > 0 && len(orig_langs) >= 2 {
|
||||
orig_lang = strings.ToLower(orig_langs[0][0:2])
|
||||
// normalizeLangs reduces the declared language list of a book to a single
// lower-case language code.
//
// Only the first entry of langs is considered. Values longer than three bytes
// (e.g. "en-US") are cut down to their first two bytes; shorter values are
// kept as they are. When langs is empty the placeholder "un" is returned,
// which is the same value GuessLang treats as "unknown".
func normalizeLangs(langs []string) string {
	if len(langs) == 0 {
		return "un"
	}

	lang := langs[0]
	if len(lang) > 3 {
		// Drop any region/script suffix and keep the two-letter prefix.
		lang = lang[0:2]
	}
	// Return the normalised value; the previous code computed it and then
	// returned "un" unconditionally.
	return strings.ToLower(lang)
}
|
||||
|
|
|
@ -5,45 +5,46 @@ import (
|
|||
"strings"
|
||||
|
||||
"github.com/meskio/epubgo"
|
||||
"gitlab.com/trantor/trantor/lib/database"
|
||||
)
|
||||
|
||||
type MetaData map[string]interface{}
|
||||
|
||||
func EpubMetadata(epub *epubgo.Epub) MetaData {
|
||||
metadata := MetaData{}
|
||||
func EpubMetadata(epub *epubgo.Epub) database.Book {
|
||||
book := database.Book{}
|
||||
for _, m := range epub.MetadataFields() {
|
||||
data, err := epub.Metadata(m)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
switch m {
|
||||
case "title":
|
||||
book.Title = cleanStr(strings.Join(data, ", "))
|
||||
case "creator":
|
||||
metadata["author"] = parseAuthr(data)
|
||||
book.Author = parseAuthr(data)
|
||||
case "contributor":
|
||||
book.Contributor = cleanStr(strings.Join(data, ", "))
|
||||
case "publisher":
|
||||
book.Publisher = cleanStr(strings.Join(data, ", "))
|
||||
case "description":
|
||||
metadata[m] = parseDescription(data)
|
||||
book.Description = parseDescription(data)
|
||||
case "subject":
|
||||
metadata[m] = parseSubject(data)
|
||||
book.Subject = parseSubject(data)
|
||||
case "date":
|
||||
metadata[m] = parseDate(data)
|
||||
book.Date = parseDate(data)
|
||||
case "language":
|
||||
metadata["lang"] = GuessLang(epub, data)
|
||||
case "title", "contributor", "publisher":
|
||||
metadata[m] = cleanStr(strings.Join(data, ", "))
|
||||
book.Lang = GuessLang(epub, data)
|
||||
case "identifier":
|
||||
attr, _ := epub.MetadataAttr(m)
|
||||
for i, d := range data {
|
||||
if attr[i]["scheme"] == "ISBN" {
|
||||
isbn := ISBN(d)
|
||||
if isbn != "" {
|
||||
metadata["isbn"] = isbn
|
||||
book.Isbn = isbn
|
||||
}
|
||||
}
|
||||
}
|
||||
default:
|
||||
metadata[m] = strings.Join(data, ", ")
|
||||
}
|
||||
}
|
||||
return metadata
|
||||
return book
|
||||
}
|
||||
|
||||
func cleanStr(str string) string {
|
||||
|
@ -88,9 +89,21 @@ func parseDescription(description []string) string {
|
|||
}
|
||||
|
||||
// parseSubject splits the raw subject entries into individual, normalised
// subjects.
//
// Every entry is split on "/" and then on ","; each resulting piece has its
// surrounding spaces trimmed and is lower-cased. Empty pieces are dropped.
// The result is always a non-nil slice, even when there are no subjects.
func parseSubject(subject []string) []string {
	parsed := subject
	for _, sep := range []string{"/", ","} {
		split := make([]string, 0, len(parsed))
		// Split the output of the previous pass, not the raw input, so that
		// every separator is applied.
		for _, s := range parsed {
			split = append(split, strings.Split(s, sep)...)
		}
		parsed = split
	}

	// Keep an empty (non-nil) slice as the zero result, as the original did.
	res := []string{}
	for _, s := range parsed {
		sub := strings.ToLower(strings.Trim(s, " "))
		if sub != "" {
			res = append(res, sub)
		}
	}
	return res
}
|
||||
|
|
Reference in a new issue