This repository has been archived on 2025-03-01. You can view files and clone it, but cannot push or open issues or pull requests.
trantor/lib/parser/language.go

72 lines
1.3 KiB
Go
Raw Permalink Normal View History

package parser
2015-01-22 23:02:00 -06:00
import (
"strings"
2021-02-10 15:41:20 +00:00
"github.com/abadojack/whatlanggo"
2015-04-21 21:32:01 -04:00
"github.com/meskio/epubgo"
2021-02-10 15:41:20 +00:00
"github.com/microcosm-cc/bluemonday"
2015-01-22 23:02:00 -06:00
)
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
2021-02-10 15:41:20 +00:00
cleaner := bluemonday.StrictPolicy()
2015-01-22 23:02:00 -06:00
spine, err := epub.Spine()
if err != nil {
return normalizeLangs(origLangs)
2015-01-22 23:02:00 -06:00
}
var errSpine error
errSpine = nil
2015-01-22 23:02:00 -06:00
langs := []string{}
for errSpine == nil {
2015-01-22 23:02:00 -06:00
html, err := spine.Open()
errSpine = spine.Next()
2015-01-22 23:02:00 -06:00
if err != nil {
continue
}
defer html.Close()
2021-02-10 15:41:20 +00:00
buff := cleaner.SanitizeReader(html)
info := whatlanggo.Detect(buff.String())
if info.Confidence >= whatlanggo.ReliableConfidenceThreshold {
langs = append(langs, info.Lang.Iso6391())
}
2015-01-22 23:02:00 -06:00
}
lang := commonLang(langs)
if lang == "un" {
return normalizeLangs(origLangs)
2015-01-22 23:02:00 -06:00
}
2019-06-05 10:58:11 +00:00
return normalizeLangs([]string{lang})
2015-01-22 23:02:00 -06:00
}
// commonLang returns the entry that occurs most often in langs, ignoring the
// placeholder "un". It returns "un" when langs is empty or contains nothing
// but "un".
//
// When several languages share the highest count, the one that appears first
// in langs wins, so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Walk the input slice rather than the map: map iteration order is
	// randomized, which would make the winner of a tie vary between calls.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
// normalizeLangs reduces the first entry of langs to a short lower-case
// language code.
//
// Only langs[0] is considered. Everything from the first "-" onwards is
// dropped (for example "en-US" becomes "en"), surrounding whitespace is
// trimmed, and a remaining tag longer than three bytes is cut to its first two
// (for example "en_US" or "english" becomes "en"). The result is lower-cased.
//
// It returns "un" when langs is empty or when the first entry has no primary
// subtag left after trimming, so callers never receive an empty code.
func normalizeLangs(langs []string) string {
	if len(langs) == 0 {
		return "un"
	}

	lang := strings.TrimSpace(strings.Split(langs[0], "-")[0])
	if lang == "" {
		// A blank tag carries no information; report it as unknown, the same
		// as a missing one.
		return "un"
	}
	if len(lang) > 3 {
		lang = lang[0:2]
	}
	return strings.ToLower(lang)
}