73 lines
1.1 KiB
Go
73 lines
1.1 KiB
Go
package parser
|
|
|
|
import (
|
|
"io/ioutil"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"github.com/jmhodges/gocld2"
|
|
"github.com/meskio/epubgo"
|
|
)
|
|
|
|
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
|
|
spine, err := epub.Spine()
|
|
if err != nil {
|
|
return normalizeLangs(origLangs)
|
|
}
|
|
|
|
var errSpine error
|
|
errSpine = nil
|
|
langs := []string{}
|
|
for errSpine == nil {
|
|
html, err := spine.Open()
|
|
errSpine = spine.Next()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
defer html.Close()
|
|
|
|
buff, err := ioutil.ReadAll(html)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if utf8.Valid(buff) {
|
|
langs = append(langs, cld2.Detect(string(buff)))
|
|
}
|
|
}
|
|
|
|
lang := commonLang(langs)
|
|
if lang == "un" {
|
|
return normalizeLangs(origLangs)
|
|
}
|
|
return lang
|
|
}
|
|
|
|
// commonLang returns the entry that occurs most often in langs, ignoring
// the "un" marker. It returns "un" when langs is empty or contains nothing
// but "un".
//
// Ties are resolved in favour of the language that appears first in langs,
// so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Scan the input slice rather than the map: map iteration order is
	// random, which would make the winner of a tie vary between runs.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
|
|
|
|
// normalizeLangs reduces the first entry of langs to a short lower-case
// language code. It returns "un" when langs is empty; any entries after the
// first are ignored.
//
// A value containing a '-' or '_' separator (for example "en-US" or
// "spa_ES") is cut down to the part before the separator, so a
// three-letter primary subtag is kept whole rather than being chopped to
// two characters. A remaining value longer than three bytes is truncated to
// its first two bytes.
func normalizeLangs(langs []string) string {
	if len(langs) == 0 {
		return "un"
	}

	lang := langs[0]
	// Keep only the part before the first separator. i > 0 guards against
	// a leading separator, which would otherwise leave an empty code.
	if i := strings.IndexAny(lang, "-_"); i > 0 {
		lang = lang[:i]
	}
	// Fallback for long values without a separator: keep the first two
	// bytes.
	if len(lang) > 3 {
		lang = lang[0:2]
	}
	return strings.ToLower(lang)
}
|