Use cld2 for language guessing
This commit is contained in:
parent
6993fcb9e8
commit
efcc9cdf8e
4 changed files with 91 additions and 5 deletions
8
README
8
README
|
@ -26,14 +26,15 @@ You also need to install the Go dependencies:
|
||||||
# go get gopkg.in/mgo.v2 gopkg.in/mgo.v2/bson github.com/gorilla/sessions \
|
# go get gopkg.in/mgo.v2 gopkg.in/mgo.v2/bson github.com/gorilla/sessions \
|
||||||
github.com/gorilla/securecookie github.com/gorilla/mux \
|
github.com/gorilla/securecookie github.com/gorilla/mux \
|
||||||
github.com/nfnt/resize github.com/cihub/seelog \
|
github.com/nfnt/resize github.com/cihub/seelog \
|
||||||
code.google.com/p/go.crypto/scrypt
|
code.google.com/p/go.crypto/scrypt \
|
||||||
|
github.com/rainycape/cld2
|
||||||
|
|
||||||
== Installation ==
|
== Installation ==
|
||||||
=== For admins ("for developers" below) ===
|
=== For admins ("for developers" below) ===
|
||||||
|
|
||||||
Now you can install Trantor itself:
|
Now you can install Trantor itself:
|
||||||
|
|
||||||
# go get git.gitorious.org/trantor/trantor.git
|
# go get -tags prod git.gitorious.org/trantor/trantor.git
|
||||||
|
|
||||||
You can run trantor from /srv/www/trantor, for example. To do this:
|
You can run trantor from /srv/www/trantor, for example. To do this:
|
||||||
|
|
||||||
|
@ -63,7 +64,8 @@ $ cd yournames-trantor
|
||||||
You can edit config.go if you want to change the port (8080 by default) and other configuration
|
You can edit config.go if you want to change the port (8080 by default) and other configuration
|
||||||
|
|
||||||
Now you can compile Trantor:
|
Now you can compile Trantor:
|
||||||
$ go build
|
$ go build -tags prod
|
||||||
|
(remove '-tags prod' for a faster compilation without language guessing)
|
||||||
|
|
||||||
Now you can run it:
|
Now you can run it:
|
||||||
$ ./yourname-trantor
|
$ ./yourname-trantor
|
||||||
|
|
68
language.go
Normal file
68
language.go
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
// +build prod
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"io/ioutil"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"git.gitorious.org/go-pkg/epubgo.git"
|
||||||
|
"github.com/rainycape/cld2"
|
||||||
|
)
|
||||||
|
|
||||||
|
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
|
||||||
|
spine, err := epub.Spine()
|
||||||
|
if err != nil {
|
||||||
|
return orig_langs
|
||||||
|
}
|
||||||
|
|
||||||
|
var err_spine error
|
||||||
|
err_spine = nil
|
||||||
|
langs := []string{}
|
||||||
|
for err_spine == nil {
|
||||||
|
html, err := spine.Open()
|
||||||
|
err_spine = spine.Next()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
defer html.Close()
|
||||||
|
|
||||||
|
buff, err := ioutil.ReadAll(html)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
langs = append(langs, cld2.Detect(string(buff)))
|
||||||
|
}
|
||||||
|
|
||||||
|
lang := commonLang(langs)
|
||||||
|
if lang != "un" && differentLang(lang, orig_langs) {
|
||||||
|
return []string{lang}
|
||||||
|
}
|
||||||
|
return orig_langs
|
||||||
|
}
|
||||||
|
|
||||||
|
// commonLang returns the value that appears most often in langs, ignoring
// "un" entries. It returns "un" when langs is empty or contains nothing but
// "un".
//
// When several languages share the highest count, the one that appears first
// in langs wins, so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Walk the input slice instead of the map: map iteration order is
	// random, which would make the winner of a tie vary between runs.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
|
||||||
|
|
||||||
|
// differentLang reports whether lang differs from the language declared
// first in orig_langs.
//
// Only the first two bytes of orig_langs[0], lowercased, take part in the
// comparison, so a declared "EN-us" matches a detected "en". When orig_langs
// is empty or its first entry is shorter than two bytes, the declared
// language is treated as "un".
func differentLang(lang string, orig_langs []string) bool {
	origLang := "un"
	// The length check must be on the first entry itself: slicing a string
	// shorter than two bytes with [:2] would panic.
	if len(orig_langs) > 0 && len(orig_langs[0]) >= 2 {
		origLang = strings.ToLower(orig_langs[0][:2])
	}
	return origLang != lang
}
|
16
language_develop.go
Normal file
16
language_develop.go
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
// +build !prod
|
||||||
|
|
||||||
|
// This is a dummy implementation of GuessLang used to make the compilation faster on development
|
||||||
|
//
|
||||||
|
// To build trantor with the proper language guessing do:
|
||||||
|
// $ go build -tags prod
|
||||||
|
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.gitorious.org/go-pkg/epubgo.git"
|
||||||
|
)
|
||||||
|
|
||||||
|
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
|
||||||
|
return orig_langs
|
||||||
|
}
|
|
@ -124,7 +124,7 @@ func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]int
|
||||||
case "date":
|
case "date":
|
||||||
book[m] = parseDate(data)
|
book[m] = parseDate(data)
|
||||||
case "language":
|
case "language":
|
||||||
book["lang"] = data
|
book["lang"] = GuessLang(epub, data)
|
||||||
case "title", "contributor", "publisher":
|
case "title", "contributor", "publisher":
|
||||||
book[m] = cleanStr(strings.Join(data, ", "))
|
book[m] = cleanStr(strings.Join(data, ", "))
|
||||||
case "identifier":
|
case "identifier":
|
||||||
|
@ -140,7 +140,7 @@ func parseFile(epub *epubgo.Epub, store *storage.Store) (metadata map[string]int
|
||||||
}
|
}
|
||||||
|
|
||||||
id = genId()
|
id = genId()
|
||||||
book["id"] = id //TODO
|
book["id"] = id
|
||||||
book["cover"] = GetCover(epub, id, store)
|
book["cover"] = GetCover(epub, id, store)
|
||||||
return book, id
|
return book, id
|
||||||
}
|
}
|
||||||
|
|
Reference in a new issue