This repository has been archived on 2025-03-01. You can view files and clone it, but cannot push or open issues or pull requests.
trantor/lib/parser/language.go
2021-02-10 15:41:20 +00:00

71 lines
1.3 KiB
Go

package parser
import (
"strings"
"github.com/abadojack/whatlanggo"
"github.com/meskio/epubgo"
"github.com/microcosm-cc/bluemonday"
)
// GuessLang guesses the language of an epub from its text content.
//
// Every document in the epub's spine is stripped of markup and run through
// language detection; detections that reach
// whatlanggo.ReliableConfidenceThreshold each cast one vote, and the language
// with the most votes wins.
//
// origLangs is used as a fallback when the spine cannot be obtained or when no
// language was detected reliably. In every case the result is passed through
// normalizeLangs before being returned.
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
	spine, err := epub.Spine()
	if err != nil {
		return normalizeLangs(origLangs)
	}

	cleaner := bluemonday.StrictPolicy()
	var langs []string
	var errSpine error
	for errSpine == nil {
		html, err := spine.Open()
		// Advance before looking at err so that a document that fails to open
		// is skipped instead of being retried forever.
		errSpine = spine.Next()
		if err != nil {
			continue
		}

		buff := cleaner.SanitizeReader(html)
		// The sanitized text is fully buffered at this point, so release the
		// document now. A defer here would keep every spine document open
		// until the function returns.
		html.Close()

		info := whatlanggo.Detect(buff.String())
		if info.Confidence >= whatlanggo.ReliableConfidenceThreshold {
			langs = append(langs, info.Lang.Iso6391())
		}
	}

	lang := commonLang(langs)
	if lang == "un" {
		return normalizeLangs(origLangs)
	}
	return normalizeLangs([]string{lang})
}
// commonLang returns the language that occurs most often in langs, ignoring
// the "un" marker. It returns "un" when langs is empty or contains nothing
// but "un".
//
// When several languages share the highest count, the one that appears first
// in langs is returned, so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Walk the input slice rather than the map: map iteration order is
	// randomized, which would make the winner of a tie vary between calls.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
// normalizeLangs reduces a list of language tags to a single lowercase code.
//
// Only the first entry of langs is considered. Everything from the first "-"
// onwards is dropped, a remainder longer than three bytes is cut down to its
// first two bytes, and the result is lowercased. An empty list yields "un".
func normalizeLangs(langs []string) string {
	if len(langs) < 1 {
		return "un"
	}

	primary := langs[0]
	if dash := strings.IndexByte(primary, '-'); dash >= 0 {
		primary = primary[:dash]
	}
	if len(primary) > 3 {
		primary = primary[:2]
	}
	return strings.ToLower(primary)
}