This repository has been archived on 2025-03-01. You can view files and clone it, but cannot push or open issues or pull requests.
trantor/lib/parser/language.go
Las Zenow e72de38725 [WIP] migration to psql
TODO:
[ ] stats
[ ] indexes
2017-05-21 10:44:43 +00:00

72 lines
1.1 KiB
Go

package parser
import (
"io/ioutil"
"strings"
"unicode/utf8"
"github.com/jmhodges/gocld2"
"github.com/meskio/epubgo"
)
// GuessLang returns a language code for epub.
//
// It walks the epub spine, runs cld2 language detection on every entry that
// can be read and is valid UTF-8, and returns the language detected most
// often (see commonLang). When the spine cannot be obtained, or when no
// entry yields a language other than "un", it falls back to
// normalizeLangs(origLangs).
//
// Entries that fail to open or read are skipped; detection is best-effort.
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
	spine, err := epub.Spine()
	if err != nil {
		return normalizeLangs(origLangs)
	}

	var langs []string
	var errSpine error
	for errSpine == nil {
		html, err := spine.Open()
		// Advance before inspecting err so a broken entry cannot stall the
		// loop; a non-nil result from Next ends the iteration.
		errSpine = spine.Next()
		if err != nil {
			continue
		}

		buff, err := ioutil.ReadAll(html)
		// Close right away instead of deferring: a defer inside the loop
		// would keep every spine entry open until GuessLang returns.
		html.Close()
		if err != nil {
			continue
		}

		if utf8.Valid(buff) {
			langs = append(langs, cld2.Detect(string(buff)))
		}
	}

	lang := commonLang(langs)
	if lang == "un" {
		return normalizeLangs(origLangs)
	}
	return lang
}
// commonLang returns the entry that occurs most often in langs, ignoring
// the placeholder "un". It returns "un" when langs is empty or contains
// nothing but "un".
//
// Ties are resolved in favour of the language that appears first in langs,
// so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Walk the input slice rather than the map: map iteration order is
	// randomised, which would make the winner of a tie vary between runs.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
// normalizeLangs derives a single lower-case language code from the first
// entry of langs. Entries longer than three bytes (for example "en-US") are
// cut down to their first two bytes; shorter entries are kept as they are.
// It returns "un" when langs is empty.
func normalizeLangs(langs []string) string {
	if len(langs) == 0 {
		return "un"
	}

	lang := langs[0]
	if len(lang) > 3 {
		lang = lang[0:2]
	}
	// Return the normalized value; returning the constant "un" here would
	// throw away the result computed above.
	return strings.ToLower(lang)
}