Tokenize, converting to ASCII

This commit is contained in:
Las Zenow 2014-06-29 22:49:50 -05:00
parent 59eaa4e2aa
commit 30af19cd62
3 changed files with 24 additions and 9 deletions

3
README
View file

@ -25,7 +25,8 @@ You also need to install Go dependencies:
# go get labix.org/v2/mgo/bson labix.org/v2/mgo/ github.com/gorilla/sessions \
github.com/gorilla/securecookie github.com/gorilla/mux \
github.com/nfnt/resize github.com/cihub/seelog
github.com/nfnt/resize github.com/cihub/seelog \
gopkgs.com/unidecode.v1
== Installation ==
=== For admins ("for developers" below) ===

View file

@ -9,7 +9,7 @@ import (
)
func buildQuery(q string) bson.M {
var reg []bson.RegEx
var keywords []string
query := bson.M{"active": true}
words := strings.Split(q, " ")
for _, w := range words {
@ -17,11 +17,12 @@ func buildQuery(q string) bson.M {
if len(tag) > 1 {
query[tag[0]] = bson.RegEx{tag[1], "i"}
} else {
reg = append(reg, bson.RegEx{w, "i"})
toks := tokens(w)
keywords = append(keywords, toks...)
}
}
if len(reg) > 0 {
query["keywords"] = bson.M{"$all": reg}
if len(keywords) > 0 {
query["keywords"] = bson.M{"$all": keywords}
}
return query
}

View file

@ -4,11 +4,13 @@ import (
"bytes"
"git.gitorious.org/go-pkg/epubgo.git"
"git.gitorious.org/trantor/trantor.git/database"
"gopkgs.com/unidecode.v1"
"io"
"io/ioutil"
"labix.org/v2/mgo/bson"
"regexp"
"strings"
"unicode"
)
func OpenBook(id bson.ObjectId, db *database.DB) (*epubgo.Epub, error) {
@ -116,14 +118,25 @@ func parseDate(date []string) string {
// keywords builds the list of search keywords for a book from the "title",
// "author", "publisher" and "subject" entries of the metadata map b.
// Entries that are missing or hold an unexpected type are ignored: the
// failed type assertions yield zero values (empty string / nil slice).
//
// NOTE(review): this listing appears to come from a diff rendered without
// +/- markers, so each strings.Split line (and the plain append of subject)
// is presumably the removed predecessor of the tokens(...) line that follows
// it — confirm against the repository before compiling this text as-is.
func keywords(b map[string]interface{}) (k []string) {
title, _ := b["title"].(string)
k = strings.Split(title, " ")
k = tokens(title)
author, _ := b["author"].([]string)
for _, a := range author {
k = append(k, strings.Split(a, " ")...)
k = append(k, tokens(a)...)
}
publisher, _ := b["publisher"].(string)
k = append(k, strings.Split(publisher, " ")...)
k = append(k, tokens(publisher)...)
subject, _ := b["subject"].([]string)
k = append(k, subject...)
for _, s := range subject {
k = append(k, tokens(s)...)
}
return
}
// tokens normalizes str and splits it into search tokens.
//
// The text is first passed through unidecode.Unidecode (presumably an ASCII
// transliteration — see that package's documentation), then lowercased, and
// finally cut at every whitespace, punctuation or control rune. Separator
// runs never produce empty tokens; a string made only of separators yields
// an empty slice.
func tokens(str string) []string {
	// isSeparator reports whether r ends the current token.
	isSeparator := func(r rune) bool {
		switch {
		case unicode.IsControl(r), unicode.IsPunct(r), unicode.IsSpace(r):
			return true
		}
		return false
	}

	normalized := strings.ToLower(unidecode.Unidecode(str))
	return strings.FieldsFunc(normalized, isSeparator)
}