Use cld2 for language guessing

This commit is contained in:
Las Zenow 2015-01-22 23:02:00 -06:00
parent 6993fcb9e8
commit efcc9cdf8e
4 changed files with 91 additions and 5 deletions

8
README
View file

@ -26,14 +26,15 @@ You also need to install Go dependencies:
# go get gopkg.in/mgo.v2 gopkg.in/mgo.v2/bson github.com/gorilla/sessions \ # go get gopkg.in/mgo.v2 gopkg.in/mgo.v2/bson github.com/gorilla/sessions \
github.com/gorilla/securecookie github.com/gorilla/mux \ github.com/gorilla/securecookie github.com/gorilla/mux \
github.com/nfnt/resize github.com/cihub/seelog \ github.com/nfnt/resize github.com/cihub/seelog \
code.google.com/p/go.crypto/scrypt code.google.com/p/go.crypto/scrypt \
github.com/rainycape/cld2
== Installation == == Installation ==
=== For admins ("for developers" below) === === For admins ("for developers" below) ===
Now you can install Trantor itself: Now you can install Trantor itself:
# go get git.gitorious.org/trantor/trantor.git # go get -tags prod git.gitorious.org/trantor/trantor.git
You can run trantor in /srv/www/trantor i.e. For this: You can run trantor in /srv/www/trantor i.e. For this:
@ -63,7 +64,8 @@ $ cd yournames-trantor
You can edit config.go if you want to change the port and other configuration; the default port is 8080. You can edit config.go if you want to change the port and other configuration; the default port is 8080.
Now you can compile Trantor: Now you can compile Trantor:
$ go build $ go build -tags prod
(remove '-tags prod' for a faster compilation without language guessing)
Now you can run it: Now you can run it:
$ ./yourname-trantor $ ./yourname-trantor

68
language.go Normal file
View file

@ -0,0 +1,68 @@
// +build prod
package main
import (
"io/ioutil"
"strings"
"git.gitorious.org/go-pkg/epubgo.git"
"github.com/rainycape/cld2"
)
// GuessLang returns the language list to store for an epub.
//
// It walks the documents of the epub spine, runs cld2 detection on the
// contents of every document that can be opened and read, and takes the most
// frequently detected language. When that language is known (not "un") and
// differs from the declared orig_langs, it is returned as a single-element
// slice; in every other case, including failure to obtain the spine,
// orig_langs is returned unchanged.
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
	spine, err := epub.Spine()
	if err != nil {
		return orig_langs
	}

	var langs []string
	var errSpine error
	for errSpine == nil {
		html, err := spine.Open()
		// Advance before looking at the Open error so that a document which
		// fails to open cannot stall the loop.
		errSpine = spine.Next()
		if err != nil {
			continue
		}

		buff, err := ioutil.ReadAll(html)
		// Close right away instead of deferring: a defer inside the loop would
		// keep every spine document open until the function returns.
		html.Close()
		if err != nil {
			continue
		}
		langs = append(langs, cld2.Detect(string(buff)))
	}

	lang := commonLang(langs)
	if lang != "un" && differentLang(lang, orig_langs) {
		return []string{lang}
	}
	return orig_langs
}
// commonLang returns the language code that occurs most often in langs,
// ignoring the "un" (unknown) code. It returns "un" when langs is empty or
// contains nothing but "un".
//
// Ties are resolved in favour of the language that appears first in langs, so
// the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Scan the input slice rather than the map: map iteration order is
	// unspecified, which would make the winner of a tie vary between runs.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
// differentLang reports whether lang differs from the first declared language
// in orig_langs.
//
// Only the first two characters of orig_langs[0] are compared, lower-cased, so
// a declared "en-US" or "EN" matches a detected "en". When orig_langs is empty
// or its first entry is shorter than two characters, the declared language is
// treated as "un" (unknown).
func differentLang(lang string, orig_langs []string) bool {
	orig_lang := "un"
	// Check the length of the first entry, not of the slice, before slicing
	// its two-character prefix.
	if len(orig_langs) > 0 && len(orig_langs[0]) >= 2 {
		orig_lang = strings.ToLower(orig_langs[0][0:2])
	}
	return orig_lang != lang
}

16
language_develop.go Normal file
View file

@ -0,0 +1,16 @@
// +build !prod
// This is a dummy implementation of GuessLang used to make the compilation faster on development
//
// To build trantor with the proper language guessing do:
// $ go build -tags prod
package main
import (
"git.gitorious.org/go-pkg/epubgo.git"
)
// GuessLang is the development stand-in for language guessing: it does not
// inspect epub and returns orig_langs unchanged.
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
	return orig_langs
}

View file

@ -124,7 +124,7 @@ func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]int
case "date": case "date":
book[m] = parseDate(data) book[m] = parseDate(data)
case "language": case "language":
book["lang"] = data book["lang"] = GuessLang(epub, data)
case "title", "contributor", "publisher": case "title", "contributor", "publisher":
book[m] = cleanStr(strings.Join(data, ", ")) book[m] = cleanStr(strings.Join(data, ", "))
case "identifier": case "identifier":
@ -140,7 +140,7 @@ func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]int
} }
id = genId() id = genId()
book["id"] = id //TODO book["id"] = id
book["cover"] = GetCover(epub, id, store) book["cover"] = GetCover(epub, id, store)
return book, id return book, id
} }