From 30af19cd6227d8e4e22321231b7302c5a2839a21 Mon Sep 17 00:00:00 2001 From: Las Zenow Date: Sun, 29 Jun 2014 22:49:50 -0500 Subject: [PATCH] Tokenize converting to ascii --- README | 3 ++- search.go | 9 +++++---- store.go | 21 +++++++++++++++++---- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/README b/README index 395f6e7..9481fa1 100644 --- a/README +++ b/README @@ -25,7 +25,8 @@ Yo also need to install go dependences: # go get labix.org/v2/mgo/bson labix.org/v2/mgo/ github.com/gorilla/sessions \ github.com/gorilla/securecookie github.com/gorilla/mux \ - github.com/nfnt/resize github.com/cihub/seelog + github.com/nfnt/resize github.com/cihub/seelog \ + gopkgs.com/unidecode.v1 == Installation == === For admins ("for developers" below) === diff --git a/search.go b/search.go index baaa582..b8f81fe 100644 --- a/search.go +++ b/search.go @@ -9,7 +9,7 @@ import ( ) func buildQuery(q string) bson.M { - var reg []bson.RegEx + var keywords []string query := bson.M{"active": true} words := strings.Split(q, " ") for _, w := range words { @@ -17,11 +17,12 @@ func buildQuery(q string) bson.M { if len(tag) > 1 { query[tag[0]] = bson.RegEx{tag[1], "i"} } else { - reg = append(reg, bson.RegEx{w, "i"}) + toks := tokens(w) + keywords = append(keywords, toks...) 
} } - if len(reg) > 0 { - query["keywords"] = bson.M{"$all": reg} + if len(keywords) > 0 { + query["keywords"] = bson.M{"$all": keywords} } return query } diff --git a/store.go b/store.go index 3b70a34..b7a4ad2 100644 --- a/store.go +++ b/store.go @@ -4,11 +4,13 @@ import ( "bytes" "git.gitorious.org/go-pkg/epubgo.git" "git.gitorious.org/trantor/trantor.git/database" + "gopkgs.com/unidecode.v1" "io" "io/ioutil" "labix.org/v2/mgo/bson" "regexp" "strings" + "unicode" ) func OpenBook(id bson.ObjectId, db *database.DB) (*epubgo.Epub, error) { @@ -116,14 +118,25 @@ func parseDate(date []string) string { func keywords(b map[string]interface{}) (k []string) { title, _ := b["title"].(string) - k = strings.Split(title, " ") + k = tokens(title) author, _ := b["author"].([]string) for _, a := range author { - k = append(k, strings.Split(a, " ")...) + k = append(k, tokens(a)...) } publisher, _ := b["publisher"].(string) - k = append(k, strings.Split(publisher, " ")...) + k = append(k, tokens(publisher)...) subject, _ := b["subject"].([]string) - k = append(k, subject...) + for _, s := range subject { + k = append(k, tokens(s)...) + } return } + +func tokens(str string) []string { + str = unidecode.Unidecode(str) + str = strings.ToLower(str) + f := func(r rune) bool { + return unicode.IsControl(r) || unicode.IsPunct(r) || unicode.IsSpace(r) + } + return strings.FieldsFunc(str, f) +}