71 lines
1.3 KiB
Go
71 lines
1.3 KiB
Go
package parser
|
|
|
|
import (
|
|
"strings"
|
|
|
|
"github.com/abadojack/whatlanggo"
|
|
"github.com/meskio/epubgo"
|
|
"github.com/microcosm-cc/bluemonday"
|
|
)
|
|
|
|
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
|
|
cleaner := bluemonday.StrictPolicy()
|
|
spine, err := epub.Spine()
|
|
if err != nil {
|
|
return normalizeLangs(origLangs)
|
|
}
|
|
|
|
var errSpine error
|
|
errSpine = nil
|
|
langs := []string{}
|
|
for errSpine == nil {
|
|
html, err := spine.Open()
|
|
errSpine = spine.Next()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
defer html.Close()
|
|
|
|
buff := cleaner.SanitizeReader(html)
|
|
info := whatlanggo.Detect(buff.String())
|
|
if info.Confidence >= whatlanggo.ReliableConfidenceThreshold {
|
|
langs = append(langs, info.Lang.Iso6391())
|
|
}
|
|
}
|
|
|
|
lang := commonLang(langs)
|
|
if lang == "un" {
|
|
return normalizeLangs(origLangs)
|
|
}
|
|
return normalizeLangs([]string{lang})
|
|
}
|
|
|
|
// commonLang returns the entry that occurs most often in langs, ignoring the
// placeholder "un". It returns "un" when langs is empty or contains nothing
// but "un".
//
// When several languages share the highest count, the one that appears first
// in langs wins, so the result is the same on every call.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Scan langs rather than the map: map iteration order is unspecified,
	// which would make the winner of a tie vary between runs.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang, maxcount = l, c
		}
	}
	return lang
}
|
|
|
|
// normalizeLangs reduces the first entry of langs to a short lowercase code.
//
// Only langs[0] is considered. Everything from the first "-" onwards is
// dropped, a remainder longer than three bytes is cut down to its first two
// bytes, and the result is lowercased. An empty langs yields "un".
func normalizeLangs(langs []string) string {
	if len(langs) < 1 {
		return "un"
	}

	primary := langs[0]
	if dash := strings.Index(primary, "-"); dash >= 0 {
		primary = primary[:dash]
	}
	// Truncate before lowercasing, in that order, so the byte positions are
	// those of the input string.
	if len(primary) > 3 {
		primary = primary[:2]
	}
	return strings.ToLower(primary)
}
|