This repository has been archived on 2025-03-01. You can view files and clone it, but cannot push or open issues or pull requests.
trantor/lib/parser/language.go

74 lines
1.1 KiB
Go
Raw Normal View History

package parser
2015-01-22 23:02:00 -06:00
import (
"io/ioutil"
"strings"
"unicode/utf8"
2015-01-22 23:02:00 -06:00
2017-02-04 12:43:25 +00:00
"github.com/jmhodges/gocld2"
2015-04-21 21:32:01 -04:00
"github.com/meskio/epubgo"
2015-01-22 23:02:00 -06:00
)
func GuessLang(epub *epubgo.Epub, origLangs []string) string {
2015-01-22 23:02:00 -06:00
spine, err := epub.Spine()
if err != nil {
return normalizeLangs(origLangs)
2015-01-22 23:02:00 -06:00
}
var errSpine error
errSpine = nil
2015-01-22 23:02:00 -06:00
langs := []string{}
for errSpine == nil {
2015-01-22 23:02:00 -06:00
html, err := spine.Open()
errSpine = spine.Next()
2015-01-22 23:02:00 -06:00
if err != nil {
continue
}
defer html.Close()
buff, err := ioutil.ReadAll(html)
if err != nil {
continue
}
if utf8.Valid(buff) {
langs = append(langs, cld2.Detect(string(buff)))
}
2015-01-22 23:02:00 -06:00
}
lang := commonLang(langs)
if lang == "un" {
return normalizeLangs(origLangs)
2015-01-22 23:02:00 -06:00
}
return lang
2015-01-22 23:02:00 -06:00
}
// commonLang returns the entry that occurs most often in langs, ignoring the
// "un" code. It returns "un" when langs is empty or contains nothing but "un".
//
// When several languages share the highest count, the one that appears first
// in langs wins, so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Range over the slice, not the map: map iteration order is randomized,
	// which would make the winner of a tie vary from run to run.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
// normalizeLangs derives a single lower-case language code from langs.
//
// Only the first element is considered. A value longer than three bytes is
// cut down to its first two bytes (for example "en-US" becomes "en"); shorter
// values are kept whole. An empty slice yields "un".
func normalizeLangs(langs []string) string {
	if len(langs) < 1 {
		return "un"
	}

	first := langs[0]
	if len(first) <= 3 {
		return strings.ToLower(first)
	}
	return strings.ToLower(first[:2])
}