From dc21299e19fcf6fc04048098ec3f51435d09dba4 Mon Sep 17 00:00:00 2001
From: Las Zenow
Date: Tue, 11 Feb 2014 17:51:59 +0100
Subject: [PATCH] Add importer tool to import books directly from hard disk

---
 .gitignore                  |   4 +-
 tools/README                |   2 +
 tools/importer/config.go    |  53 ++++++
 tools/importer/cover.go     | 202 ++++++++++++++++++++++
 tools/importer/database.go  | 327 ++++++++++++++++++++++++++++++++++++
 tools/importer/importer.go  |  70 +++++++
 tools/importer/mapreduce.go | 283 +++++++++++++++++++++++++++++++
 tools/importer/search.go    |  85 ++++++++++
 tools/importer/session.go   |  81 +++++++++
 tools/importer/stats.go     | 244 +++++++++++++++++++++++++++
 tools/importer/store.go     | 128 ++++++++++++++
 tools/importer/upload.go    | 146 ++++++++++++++++
 12 files changed, 1622 insertions(+), 3 deletions(-)
 create mode 100644 tools/importer/config.go
 create mode 100644 tools/importer/cover.go
 create mode 100644 tools/importer/database.go
 create mode 100644 tools/importer/importer.go
 create mode 100644 tools/importer/mapreduce.go
 create mode 100644 tools/importer/search.go
 create mode 100644 tools/importer/session.go
 create mode 100644 tools/importer/stats.go
 create mode 100644 tools/importer/store.go
 create mode 100644 tools/importer/upload.go

diff --git a/.gitignore b/.gitignore
index 9510703..d1005e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
-books/
-new/
-cover/
 trantor
 tools/adduser/adduser
 tools/update/update
@@ -8,5 +5,6 @@ tools/togridfs/togridfs
 tools/getISBNnDesc/getISBNnDesc
 tools/coverNew/coverNew
 tools/addsize/addsize
+tools/importer/importer
 tags
 .*.swp
diff --git a/tools/README b/tools/README
index d0b3b4b..84312cd 100644
--- a/tools/README
+++ b/tools/README
@@ -4,6 +4,8 @@ Some dirty tools to manage trantor:
   $ adduser myNick
   Password:
 
+- importer. Import all the epubs passed as parameters into the database and approve them.
+
 - update. Update the cover of all the books. It might be outdated.
 
 - togridfs (23/4/2013). Migrate all files and covers to gridfs
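
[Usage sketch, not part of the original patch — it assumes a standard Go
toolchain and a MongoDB instance listening on DB_IP (127.0.0.1) from
config.go. The binary builds to tools/importer/importer (see the .gitignore
hunk above) and takes epub paths as arguments, following the adduser
convention:

    $ cd tools/importer && go build
    $ ./importer path/to/book1.epub path/to/book2.epub

Each file is parsed, stored in GridFS, and auto-approved only if no active
book with the same title is already in the database.]
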
diff --git a/tools/importer/config.go b/tools/importer/config.go
new file mode 100644
index 0000000..6099708
--- /dev/null
+++ b/tools/importer/config.go
@@ -0,0 +1,53 @@
+package main
+
+const (
+	PORT = "8080"
+
+	DB_IP                  = "127.0.0.1"
+	DB_NAME                = "trantor"
+	META_COLL              = "meta"
+	BOOKS_COLL             = "books"
+	TAGS_COLL              = "tags"
+	VISITED_COLL           = "visited"
+	DOWNLOADED_COLL        = "downloaded"
+	HOURLY_VISITS_COLL     = "visits.hourly"
+	DAILY_VISITS_COLL      = "visits.daily"
+	MONTHLY_VISITS_COLL    = "visits.monthly"
+	HOURLY_DOWNLOADS_COLL  = "downloads.hourly"
+	DAILY_DOWNLOADS_COLL   = "downloads.daily"
+	MONTHLY_DOWNLOADS_COLL = "downloads.monthly"
+	USERS_COLL             = "users"
+	NEWS_COLL              = "news"
+	STATS_COLL             = "statistics"
+	FS_BOOKS               = "fs_books"
+	FS_IMGS                = "fs_imgs"
+
+	PASS_SALT                 = "ImperialLibSalt"
+	MINUTES_UPDATE_TAGS       = 11
+	MINUTES_UPDATE_VISITED    = 41
+	MINUTES_UPDATE_DOWNLOADED = 47
+	MINUTES_UPDATE_HOURLY_V   = 31
+	MINUTES_UPDATE_DAILY_V    = 60*12 + 7
+	MINUTES_UPDATE_MONTHLY_V  = 60*24 + 11
+	MINUTES_UPDATE_HOURLY_D   = 29
+	MINUTES_UPDATE_DAILY_D    = 60*12 + 13
+	MINUTES_UPDATE_MONTHLY_D  = 60*24 + 17
+	MINUTES_UPDATE_LOGGER     = 5
+	TAGS_DISPLAY              = 50
+	SEARCH_ITEMS_PAGE         = 20
+	NEW_ITEMS_PAGE            = 50
+	NUM_NEWS                  = 10
+	DAYS_NEWS_INDEXPAGE       = 15
+
+	TEMPLATE_PATH = "templates/"
+	CSS_PATH      = "css/"
+	JS_PATH       = "js/"
+	IMG_PATH      = "img/"
+	LOGGER_CONFIG = "logger.xml"
+
+	IMG_WIDTH_BIG   = 300
+	IMG_WIDTH_SMALL = 60
+	IMG_QUALITY     = 80
+
+	CHAN_SIZE = 100
+)
diff --git a/tools/importer/cover.go b/tools/importer/cover.go
new file mode 100644
index 0000000..c6e3bae
--- /dev/null
+++ b/tools/importer/cover.go
@@ -0,0 +1,202 @@
+package main
+
+import log "github.com/cihub/seelog"
+import _ "image/png"
+import _ "image/jpeg"
+import _ "image/gif"
+
+import (
+	"bytes"
+	"git.gitorious.org/go-pkg/epubgo.git"
+	"github.com/gorilla/mux"
+	"github.com/nfnt/resize"
+	"image"
+	"image/jpeg"
+	"io"
+	"io/ioutil"
+	"labix.org/v2/mgo"
+	"labix.org/v2/mgo/bson"
+	"regexp"
+	"strings"
+)
+
+func coverHandler(h handler) {
+	vars := mux.Vars(h.r)
+	if !bson.IsObjectIdHex(vars["id"]) {
+		notFound(h)
+		return
+	}
+	id := bson.ObjectIdHex(vars["id"])
+	books, _, err := h.db.GetBooks(bson.M{"_id": id})
+	if err != nil || len(books) == 0 {
+		notFound(h)
+		return
+	}
+	book := books[0]
+
+	if !book.Active {
+		if !h.sess.IsAdmin() {
+			notFound(h)
+			return
+		}
+	}
+
+	fs := h.db.GetFS(FS_IMGS)
+	var f *mgo.GridFile
+	if vars["size"] == "small" {
+		f, err = fs.OpenId(book.CoverSmall)
+	} else {
+		f, err = fs.OpenId(book.Cover)
+	}
+	if err != nil {
+		log.Error("Error while opening image: ", err)
+		notFound(h)
+		return
+	}
+	defer f.Close()
+
+	headers := h.w.Header()
+	headers["Content-Type"] = []string{"image/jpeg"}
+
+	io.Copy(h.w, f)
+}
+
+func GetCover(e *epubgo.Epub, title string, db *DB) (bson.ObjectId, bson.ObjectId) {
+	imgId, smallId := coverFromMetadata(e, title, db)
+	if imgId != "" {
+		return imgId, smallId
+	}
+
+	imgId, smallId = searchCommonCoverNames(e, title, db)
+	if imgId != "" {
+		return imgId, smallId
+	}
+
+	/* search for img on the text */
+	exp, _ := regexp.Compile("<.*ima?g.*[(src)(href)]=[\"']([^\"']*(\\.[^\\.\"']*))[\"']")
+	it, errNext := e.Spine()
+	for errNext == nil {
+		file, err := it.Open()
+		if err != nil {
+			break
+		}
+		defer file.Close()
+
+		txt, err := ioutil.ReadAll(file)
+		if err != nil {
+			break
+		}
+		res := exp.FindSubmatch(txt)
+		if res != nil {
+			href := string(res[1])
+			urlPart := strings.Split(it.URL(), "/")
+			url := strings.Join(urlPart[:len(urlPart)-1], "/")
+			if href[:3] == "../" {
+				href = href[3:]
+				url = strings.Join(urlPart[:len(urlPart)-2], "/")
+			}
+			href = strings.Replace(href, "%20", " ", -1)
+			href = strings.Replace(href, "%27", "'", -1)
+			href = strings.Replace(href, "%28", "(", -1)
+			href = strings.Replace(href, "%29", ")", -1)
+			if url == "" {
+				url = href
+			} else {
+				url = url + "/" + href
+			}
+
+			img, err := e.OpenFile(url)
+			if err == nil {
+				defer img.Close()
+				return storeImg(img, title, db)
+			}
+		}
+		errNext = it.Next()
+	}
+	return "", ""
+}
+
+func coverFromMetadata(e *epubgo.Epub, title string, db *DB) (bson.ObjectId, bson.ObjectId) {
+	metaList, _ := e.MetadataAttr("meta")
+	for _, meta := range metaList {
+		if meta["name"] == "cover" {
+			img, err := e.OpenFileId(meta["content"])
+			if err == nil {
+				defer img.Close()
+				return storeImg(img, title, db)
+			}
+		}
+	}
+	return "", ""
+}
+
+func searchCommonCoverNames(e *epubgo.Epub, title string, db *DB) (bson.ObjectId, bson.ObjectId) {
+	for _, p := range []string{"cover.jpg", "Images/cover.jpg", "images/cover.jpg", "cover.jpeg", "cover1.jpg", "cover1.jpeg"} {
+		img, err := e.OpenFile(p)
+		if err == nil {
+			defer img.Close()
+			return storeImg(img, title, db)
+		}
+	}
+	return "", ""
+}
+
+func storeImg(img io.Reader, title string, db *DB) (bson.ObjectId, bson.ObjectId) {
+	/* open the files */
+	fBig, err := createCoverFile(title, db)
+	if err != nil {
+		log.Error("Error creating ", title, ": ", err.Error())
+		return "", ""
+	}
+	defer fBig.Close()
+
+	fSmall, err := createCoverFile(title+"_small", db)
+	if err != nil {
+		log.Error("Error creating ", title+"_small", ": ", err.Error())
+		return "", ""
+	}
+	defer fSmall.Close()
+
+	/* resize img */
+	var img2 bytes.Buffer
+	img1 := io.TeeReader(img, &img2)
+	jpgOptions := jpeg.Options{IMG_QUALITY}
+	imgResized, err := resizeImg(img1, IMG_WIDTH_BIG)
+	if err != nil {
+		log.Error("Error resizing big image: ", err.Error())
+		return "", ""
+	}
+	err = jpeg.Encode(fBig, imgResized, &jpgOptions)
+	if err != nil {
+		log.Error("Error encoding big image: ", err.Error())
+		return "", ""
+	}
+	imgSmallResized, err := resizeImg(&img2, IMG_WIDTH_SMALL)
+	if err != nil {
+		log.Error("Error resizing small image: ", err.Error())
+		return "", ""
+	}
+	err = jpeg.Encode(fSmall, imgSmallResized, &jpgOptions)
+	if err != nil {
+		log.Error("Error encoding small image: ", err.Error())
+		return "", ""
+	}
+
+	idBig, _ := fBig.Id().(bson.ObjectId)
+	idSmall, _ := fSmall.Id().(bson.ObjectId)
+	return idBig, idSmall
+}
+
+func createCoverFile(title string, db *DB) (*mgo.GridFile, error) {
+	fs := db.GetFS(FS_IMGS)
+	return fs.Create(title + ".jpg")
+}
+
+func resizeImg(imgReader io.Reader, width uint) (image.Image, error) {
+	img, _, err := image.Decode(imgReader)
+	if err != nil {
+		return nil, err
+	}
+
+	return resize.Resize(width, 0, img, resize.NearestNeighbor), nil
+}
diff --git a/tools/importer/database.go b/tools/importer/database.go
new file mode 100644
index 0000000..e49b644
--- /dev/null
+++ b/tools/importer/database.go
@@ -0,0 +1,327 @@
+package main
+
+import log "github.com/cihub/seelog"
+
+import (
+	"crypto/md5"
+	"labix.org/v2/mgo"
+	"labix.org/v2/mgo/bson"
+	"os"
+	"time"
+)
+
+type Book struct {
+	Id          string `bson:"_id"`
+	Title       string
+	Author      []string
+	Contributor string
+	Publisher   string
+	Description string
+	Subject     []string
+	Date        string
+	Lang        []string
+	Isbn        string
+	Type        string
+	Format      string
+	Source      string
+	Relation    string
+	Coverage    string
+	Rights      string
+	Meta        string
+	File        bson.ObjectId
+	FileSize    int
+	Cover       bson.ObjectId
+	CoverSmall  bson.ObjectId
+	Active      bool
+	Keywords    []string
+}
+
+type News struct {
+	Date time.Time
+	Text string
+}
+
+type DB struct {
+	session *mgo.Session
+}
+
+func initDB() *DB {
+	var err error
+	d := new(DB)
+	d.session, err = mgo.Dial(DB_IP)
+	if err != nil {
+		log.Critical(err)
+		os.Exit(1)
+	}
+	return d
+}
+
+func (d *DB) Close() {
+	d.session.Close()
+}
+
+func (d *DB) Copy() *DB {
+	dbCopy := new(DB)
+	dbCopy.session = d.session.Copy()
+	return dbCopy
+}
+
+func md5Pass(pass string) []byte {
+	h := md5.New()
+	hash := h.Sum(([]byte)(PASS_SALT + pass))
+	return hash
+}
+
+func (d *DB) SetPassword(user string, pass string) error {
+	hash := md5Pass(pass)
+	userColl := d.session.DB(DB_NAME).C(USERS_COLL)
+	return userColl.Update(bson.M{"user": user}, bson.M{"$set": bson.M{"pass": hash}})
+}
+
+func (d *DB) UserValid(user string, pass string) bool {
+	hash := md5Pass(pass)
+	userColl := d.session.DB(DB_NAME).C(USERS_COLL)
+	n, err := userColl.Find(bson.M{"user": user, "pass": hash}).Count()
+	if err != nil {
+		return false
+	}
+	return n != 0
+}
+
+func (d *DB) UserRole(user string) string {
+	type result struct {
+		Role string
+	}
+	res := result{}
+	userColl := d.session.DB(DB_NAME).C(USERS_COLL)
+	err := userColl.Find(bson.M{"user": user}).One(&res)
+	if err != nil {
+		return ""
+	}
+	return res.Role
+}
+
+func (d *DB) AddNews(text string) error {
+	var news News
+	news.Text = text
+	news.Date = time.Now()
+	newsColl := d.session.DB(DB_NAME).C(NEWS_COLL)
+	return newsColl.Insert(news)
+}
+
+func (d *DB) GetNews(num int, days int) (news []News, err error) {
+	query := bson.M{}
+	if days != 0 {
+		duration := time.Duration(-24*days) * time.Hour
+		date := time.Now().Add(duration)
+		query = bson.M{"date": bson.M{"$gt": date}}
+	}
+	newsColl := d.session.DB(DB_NAME).C(NEWS_COLL)
+	q := newsColl.Find(query).Sort("-date").Limit(num)
+	err = q.All(&news)
+	return
+}
+
+func (d *DB) InsertStats(stats interface{}) error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	return statsColl.Insert(stats)
+}
+
+func (d *DB) InsertBook(book interface{}) error {
+	booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+	return booksColl.Insert(book)
+}
+
+func (d *DB) RemoveBook(id bson.ObjectId) error {
+	booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+	return booksColl.Remove(bson.M{"_id": id})
+}
+
+func (d *DB) UpdateBook(id bson.ObjectId, data interface{}) error {
+	booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+	return booksColl.Update(bson.M{"_id": id}, bson.M{"$set": data})
+}
+
+/* optional parameters: length and start index
+ *
+ * Returns: list of books, number found and err
+ */
+func (d *DB) GetBooks(query bson.M, r ...int) (books []Book, num int, err error) {
+	var start, length int
+	if len(r) > 0 {
+		length = r[0]
+		if len(r) > 1 {
+			start = r[1]
+		}
+	}
+	booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+	q := booksColl.Find(query).Sort("-_id")
+	num, err = q.Count()
+	if err != nil {
+		return
+	}
+	if start != 0 {
+		q = q.Skip(start)
+	}
+	if length != 0 {
+		q = q.Limit(length)
+	}
+
+	err = q.All(&books)
+	for i, b := range books {
+		books[i].Id = bson.ObjectId(b.Id).Hex()
+	}
+	return
+}
+
+/* Get the most visited books
+ */
+func (d *DB) GetVisitedBooks(num int) (books []Book, err error) {
+	visitedColl := d.session.DB(DB_NAME).C(VISITED_COLL)
+	bookId, err := GetBooksVisited(num, visitedColl)
+	if err != nil {
+		return nil, err
+	}
+
+	books = make([]Book, len(bookId))
+	for i, id := range bookId {
+		booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+		booksColl.Find(bson.M{"_id": id}).One(&books[i])
+		books[i].Id = bson.ObjectId(books[i].Id).Hex()
+	}
+	return
+}
+
+func (d *DB) UpdateMostVisited() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateMostVisited(statsColl)
+}
+
+/* Get the most downloaded books
+ */
+func (d *DB) GetDownloadedBooks(num int) (books []Book, err error) {
+	downloadedColl := d.session.DB(DB_NAME).C(DOWNLOADED_COLL)
+	bookId, err := GetBooksVisited(num, downloadedColl)
+	if err != nil {
+		return nil, err
+	}
+
+	books = make([]Book, len(bookId))
+	for i, id := range bookId {
+		booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+		booksColl.Find(bson.M{"_id": id}).One(&books[i])
+		books[i].Id = bson.ObjectId(books[i].Id).Hex()
+	}
+	return
+}
+
+func (d *DB) UpdateDownloadedBooks() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateMostDownloaded(statsColl)
+}
+
+/* optional parameters: length and start index
+ *
+ * Returns: list of books, number found and err
+ */
+func (d *DB) GetNewBooks(r ...int) (books []Book, num int, err error) {
+	return d.GetBooks(bson.M{"$nor": []bson.M{{"active": true}}}, r...)
+}
+
+func (d *DB) BookActive(id bson.ObjectId) bool {
+	var book Book
+	booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+	err := booksColl.Find(bson.M{"_id": id}).One(&book)
+	if err != nil {
+		return false
+	}
+	return book.Active
+}
+
+func (d *DB) GetFS(prefix string) *mgo.GridFS {
+	return d.session.DB(DB_NAME).GridFS(prefix)
+}
+
+func (d *DB) GetTags(numTags int) ([]string, error) {
+	tagsColl := d.session.DB(DB_NAME).C(TAGS_COLL)
+	return GetTags(numTags, tagsColl)
+}
+
+func (d *DB) UpdateTags() error {
+	booksColl := d.session.DB(DB_NAME).C(BOOKS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateTags(booksColl)
+}
+
+type Visits struct {
+	Date  int64 "_id"
+	Count int   "value"
+}
+
+func (d *DB) GetHourVisits() ([]Visits, error) {
+	hourlyColl := d.session.DB(DB_NAME).C(HOURLY_VISITS_COLL)
+	return GetVisits(hourlyColl)
+}
+
+func (d *DB) UpdateHourVisits() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateHourVisits(statsColl)
+}
+
+func (d *DB) GetDayVisits() ([]Visits, error) {
+	dailyColl := d.session.DB(DB_NAME).C(DAILY_VISITS_COLL)
+	return GetVisits(dailyColl)
+}
+
+func (d *DB) UpdateDayVisits() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateDayVisits(statsColl)
+}
+
+func (d *DB) GetMonthVisits() ([]Visits, error) {
+	monthlyColl := d.session.DB(DB_NAME).C(MONTHLY_VISITS_COLL)
+	return GetVisits(monthlyColl)
+}
+
+func (d *DB) UpdateMonthVisits() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateMonthVisits(statsColl)
+}
+
+func (d *DB) GetHourDownloads() ([]Visits, error) {
+	hourlyColl := d.session.DB(DB_NAME).C(HOURLY_DOWNLOADS_COLL)
+	return GetVisits(hourlyColl)
+}
+
+func (d *DB) UpdateHourDownloads() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateHourDownloads(statsColl)
+}
+
+func (d *DB) GetDayDownloads() ([]Visits, error) {
+	dailyColl := d.session.DB(DB_NAME).C(DAILY_DOWNLOADS_COLL)
+	return GetVisits(dailyColl)
+}
+
+func (d *DB) UpdateDayDownloads() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateDayDownloads(statsColl)
+}
+
+func (d *DB) GetMonthDownloads() ([]Visits, error) {
+	monthlyColl := d.session.DB(DB_NAME).C(MONTHLY_DOWNLOADS_COLL)
+	return GetVisits(monthlyColl)
+}
+
+func (d *DB) UpdateMonthDownloads() error {
+	statsColl := d.session.DB(DB_NAME).C(STATS_COLL)
+	mr := NewMR(d.session.DB(DB_NAME))
+	return mr.UpdateMonthDownloads(statsColl)
+}
diff --git a/tools/importer/importer.go b/tools/importer/importer.go
new file mode 100644
index 0000000..b31282f
--- /dev/null
+++ b/tools/importer/importer.go
@@ -0,0 +1,70 @@
+package main
+
+import log "github.com/cihub/seelog"
+
+import (
+	"git.gitorious.org/go-pkg/epubgo.git"
+	"net/http"
+	"os"
+)
+
+func main() {
+	db := initDB()
+	defer db.Close()
+
+	for _, file := range os.Args[1:] {
+		uploadEpub(file, db)
+	}
+}
+
+func uploadEpub(filename string, db *DB) {
+	epub, err := epubgo.Open(filename)
+	if err != nil {
+		log.Error("Not a valid epub '", filename, "': ", err)
+		return
+	}
+	defer epub.Close()
+
+	book := parseFile(epub, db)
+	title, _ := book["title"].(string)
+	_, numTitleFound, _ := db.GetBooks(buildQuery("title:"+title), 1)
+	if numTitleFound == 0 {
+		book["active"] = true
+	}
+
+	file, err := os.Open(filename)
+	if err != nil {
+		log.Error("Error opening file (", filename, "): ", err)
+		return
+	}
+	defer file.Close()
+	id, size, err := StoreNewFile(title+".epub", file, db)
+	if err != nil {
+		log.Error("Error storing book (", title, "): ", err)
+		return
+	}
+
+	book["file"] = id
+	book["filesize"] = size
+	err = db.InsertBook(book)
+	if err != nil {
+		log.Error("Error storing metadata (", title, "): ", err)
+		return
+	}
+	log.Info("File uploaded: ", filename)
+}
+
+type Status struct {
+	Upload bool
+	Stats  bool
+	Search string
+}
+
+func GetStatus(h handler) Status {
+	return Status{}
+}
+
+// Stubs for the handlers shared with the web frontend; the importer never serves HTTP.
+func loadTemplate(w http.ResponseWriter, tmpl string, data interface{})    {}
+func loadTxtTemplate(w http.ResponseWriter, tmpl string, data interface{}) {}
+func notFound(h handler)                                                   {}
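
[For orientation, not part of the original patch — a rough sketch of the
document the importer inserts into the books collection; the exact field set
depends on the epub metadata handled by parseFile in upload.go below:

    {
        "title":    "Some Title",
        "author":   ["Some Author"],
        "lang":     ["en"],
        "file":     ObjectId(...),   // GridFS id returned by StoreNewFile
        "filesize": 123456,
        "cover":    ObjectId(...),   // present only when GetCover finds an image
        "keywords": ["Some", "Title", "Some", "Author"],
        "active":   true             // only when no active book has this title
    }
]
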
diff --git a/tools/importer/mapreduce.go b/tools/importer/mapreduce.go
new file mode 100644
index 0000000..c24deec
--- /dev/null
+++ b/tools/importer/mapreduce.go
@@ -0,0 +1,283 @@
+package main
+
+import (
+	"labix.org/v2/mgo"
+	"labix.org/v2/mgo/bson"
+	"time"
+)
+
+func GetTags(numTags int, tagsColl *mgo.Collection) ([]string, error) {
+	var result []struct {
+		Tag string "_id"
+	}
+	err := tagsColl.Find(nil).Sort("-value").Limit(numTags).All(&result)
+	if err != nil {
+		return nil, err
+	}
+
+	tags := make([]string, len(result))
+	for i, r := range result {
+		tags[i] = r.Tag
+	}
+	return tags, nil
+}
+
+func GetBooksVisited(num int, visitedColl *mgo.Collection) ([]bson.ObjectId, error) {
+	var result []struct {
+		Book bson.ObjectId "_id"
+	}
+	err := visitedColl.Find(nil).Sort("-value").Limit(num).All(&result)
+	if err != nil {
+		return nil, err
+	}
+
+	books := make([]bson.ObjectId, len(result))
+	for i, r := range result {
+		books[i] = r.Book
+	}
+	return books, nil
+}
+
+func GetVisits(visitsColl *mgo.Collection) ([]Visits, error) {
+	var result []Visits
+	err := visitsColl.Find(nil).All(&result)
+	return result, err
+}
+
+type MR struct {
+	database *mgo.Database
+}
+
+func NewMR(database *mgo.Database) *MR {
+	m := new(MR)
+	m.database = database
+	return m
+}
+
+func (m *MR) UpdateTags(booksColl *mgo.Collection) error {
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		if (this.subject) {
+			this.subject.forEach(function(s) { emit(s, 1); });
+		}
+	}`
+	mr.Reduce = `function(tag, vals) {
+		var count = 0;
+		vals.forEach(function() { count += 1; });
+		return count;
+	}`
+	return m.update(&mr, bson.M{"active": true}, booksColl, TAGS_COLL)
+}
+
+func (m *MR) UpdateMostVisited(statsColl *mgo.Collection) error {
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		if (this.id) {
+			emit(this.id, 1);
+		}
+	}`
+	mr.Reduce = `function(tag, vals) {
+		var count = 0;
+		vals.forEach(function() { count += 1; });
+		return count;
+	}`
+	return m.update(&mr, bson.M{"section": "book"}, statsColl, VISITED_COLL)
+}
+
+func (m *MR) UpdateMostDownloaded(statsColl *mgo.Collection) error {
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		emit(this.id, 1);
+	}`
+	mr.Reduce = `function(tag, vals) {
+		var count = 0;
+		vals.forEach(function() { count += 1; });
+		return count;
+	}`
+	return m.update(&mr, bson.M{"section": "download"}, statsColl, DOWNLOADED_COLL)
+}
+
+func (m *MR) UpdateHourVisits(statsColl *mgo.Collection) error {
+	const numDays = 2
+	start := time.Now().UTC().Add(-numDays * 24 * time.Hour)
+
+	const reduce = `function(date, vals) {
+		var count = 0;
+		vals.forEach(function(v) { count += v; });
+		return count;
+	}`
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		var date = Date.UTC(this.date.getUTCFullYear(),
+				    this.date.getUTCMonth(),
+				    this.date.getUTCDate(),
+				    this.date.getUTCHours());
+		emit({date: date, session: this.session}, 1);
+	}`
+	mr.Reduce = reduce
+	err := m.update(&mr, bson.M{"date": bson.M{"$gte": start}}, statsColl, HOURLY_VISITS_COLL+"_raw")
+	if err != nil {
+		return err
+	}
+	var mr2 mgo.MapReduce
+	mr2.Map = `function() {
+		emit(this['_id']['date'], 1);
+	}`
+	mr2.Reduce = reduce
+	hourly_raw := m.database.C(HOURLY_VISITS_COLL + "_raw")
+	return m.update(&mr2, bson.M{}, hourly_raw, HOURLY_VISITS_COLL)
+}
+
+func (m *MR) UpdateDayVisits(statsColl *mgo.Collection) error {
+	const numDays = 30
+	start := time.Now().UTC().Add(-numDays * 24 * time.Hour).Truncate(24 * time.Hour)
+
+	const reduce = `function(date, vals) {
+		var count = 0;
+		vals.forEach(function(v) { count += v; });
+		return count;
+	}`
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		var date = Date.UTC(this.date.getUTCFullYear(),
+				    this.date.getUTCMonth(),
+				    this.date.getUTCDate());
+		emit({date: date, session: this.session}, 1);
+	}`
+	mr.Reduce = reduce
+	err := m.update(&mr, bson.M{"date": bson.M{"$gte": start}}, statsColl, DAILY_VISITS_COLL+"_raw")
+	if err != nil {
+		return err
+	}
+	var mr2 mgo.MapReduce
+	mr2.Map = `function() {
+		emit(this['_id']['date'], 1);
+	}`
+	mr2.Reduce = reduce
+	daily_raw := m.database.C(DAILY_VISITS_COLL + "_raw")
+	return m.update(&mr2, bson.M{}, daily_raw, DAILY_VISITS_COLL)
+}
+
+func (m *MR) UpdateMonthVisits(statsColl *mgo.Collection) error {
+	const numDays = 365
+
+	start := time.Now().UTC().Add(-numDays * 24 * time.Hour).Truncate(24 * time.Hour)
+
+	const reduce = `function(date, vals) {
+		var count = 0;
+		vals.forEach(function(v) { count += v; });
+		return count;
+	}`
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		var date = Date.UTC(this.date.getUTCFullYear(),
+				    this.date.getUTCMonth());
+		emit({date: date, session: this.session}, 1);
+	}`
+	mr.Reduce = reduce
+	err := m.update(&mr, bson.M{"date": bson.M{"$gte": start}}, statsColl, MONTHLY_VISITS_COLL+"_raw")
+	if err != nil {
+		return err
+	}
+	var mr2 mgo.MapReduce
+	mr2.Map = `function() {
+		emit(this['_id']['date'], 1);
+	}`
+	mr2.Reduce = reduce
+	monthly_raw := m.database.C(MONTHLY_VISITS_COLL + "_raw")
+	return m.update(&mr2, bson.M{}, monthly_raw, MONTHLY_VISITS_COLL)
+}
+
+func (m *MR) UpdateHourDownloads(statsColl *mgo.Collection) error {
+	const numDays = 2
+	start := time.Now().UTC().Add(-numDays * 24 * time.Hour)
+
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		if (this.section == "download") {
+			var date = Date.UTC(this.date.getUTCFullYear(),
+					    this.date.getUTCMonth(),
+					    this.date.getUTCDate(),
+					    this.date.getUTCHours());
+			emit(date, 1);
+		}
+	}`
+	mr.Reduce = `function(date, vals) {
+		var count = 0;
+		vals.forEach(function(v) { count += v; });
+		return count;
+	}`
+	return m.update(&mr, bson.M{"date": bson.M{"$gte": start}}, statsColl, HOURLY_DOWNLOADS_COLL)
+}
+
+func (m *MR) UpdateDayDownloads(statsColl *mgo.Collection) error {
+	const numDays = 30
+	start := time.Now().UTC().Add(-numDays * 24 * time.Hour).Truncate(24 * time.Hour)
+
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		if (this.section == "download") {
+			var date = Date.UTC(this.date.getUTCFullYear(),
+					    this.date.getUTCMonth(),
+					    this.date.getUTCDate());
+			emit(date, 1);
+		}
+	}`
+	mr.Reduce = `function(date, vals) {
+		var count = 0;
+		vals.forEach(function(v) { count += v; });
+		return count;
+	}`
+	return m.update(&mr, bson.M{"date": bson.M{"$gte": start}}, statsColl, DAILY_DOWNLOADS_COLL)
+}
+
+func (m *MR) UpdateMonthDownloads(statsColl *mgo.Collection) error {
+	const numDays = 365
+
+	start := time.Now().UTC().Add(-numDays * 24 * time.Hour).Truncate(24 * time.Hour)
+
+	var mr mgo.MapReduce
+	mr.Map = `function() {
+		if (this.section == "download") {
+			var date = Date.UTC(this.date.getUTCFullYear(),
+					    this.date.getUTCMonth());
+			emit(date, 1);
+		}
+	}`
+	mr.Reduce = `function(date, vals) {
+		var count = 0;
+		vals.forEach(function(v) { count += v; });
+		return count;
+	}`
+	return m.update(&mr, bson.M{"date": bson.M{"$gte": start}}, statsColl, MONTHLY_DOWNLOADS_COLL)
+}
+
+func (m *MR) update(mr *mgo.MapReduce, query bson.M, queryColl *mgo.Collection, storeColl string) error {
+	metaColl := m.database.C(META_COLL)
+	_, err := metaColl.RemoveAll(bson.M{"type": storeColl})
+	if err != nil {
+		return err
+	}
+
+	mr.Out = bson.M{"replace": storeColl}
+	_, err = queryColl.Find(query).MapReduce(mr, nil)
+	if err != nil {
+		return err
+	}
+
+	return metaColl.Insert(bson.M{"type": storeColl})
+}
+
+func (m *MR) isOutdated(coll string, minutes float64) bool {
+	var result struct {
+		Id bson.ObjectId `bson:"_id"`
+	}
+	metaColl := m.database.C(META_COLL)
+	err := metaColl.Find(bson.M{"type": coll}).One(&result)
+	if err != nil {
+		return true
+	}
+
+	lastUpdate := result.Id.Time()
+	return time.Since(lastUpdate).Minutes() > minutes
+}
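
[Note on the visit map-reduces above, not part of the original patch: they
run in two passes — the first keys on {date, session}, so repeated hits from
one session collapse into a single entry of the *_raw collection, and the
second pass counts those entries per time bucket, i.e. distinct sessions.
Worked example: stats rows (10:05, session A), (10:20, A) and (10:40, B)
reduce to raw entries ({10h, A}: 2) and ({10h, B}: 1); the second pass then
stores 2 for the 10h bucket. The download counters (Update*Downloads) skip
the deduplication and count every hit.]
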
diff --git a/tools/importer/search.go b/tools/importer/search.go
new file mode 100644
index 0000000..9f94543
--- /dev/null
+++ b/tools/importer/search.go
@@ -0,0 +1,85 @@
+package main
+
+import (
+	"labix.org/v2/mgo/bson"
+	"net/http"
+	"strconv"
+	"strings"
+)
+
+func buildQuery(q string) bson.M {
+	var reg []bson.RegEx
+	query := bson.M{"active": true}
+	words := strings.Split(q, " ")
+	for _, w := range words {
+		tag := strings.SplitN(w, ":", 2)
+		if len(tag) > 1 {
+			query[tag[0]] = bson.RegEx{tag[1], "i"}
+		} else {
+			reg = append(reg, bson.RegEx{w, "i"})
+		}
+	}
+	if len(reg) > 0 {
+		query["keywords"] = bson.M{"$all": reg}
+	}
+	return query
+}
+
+type searchData struct {
+	S         Status
+	Found     int
+	Books     []Book
+	ItemsPage int
+	Page      int
+	Next      string
+	Prev      string
+}
+
+func searchHandler(h handler) {
+	err := h.r.ParseForm()
+	if err != nil {
+		http.Error(h.w, err.Error(), http.StatusInternalServerError)
+		return
+	}
+	req := strings.Join(h.r.Form["q"], " ")
+	page := 0
+	if len(h.r.Form["p"]) != 0 {
+		page, err = strconv.Atoi(h.r.Form["p"][0])
+		if err != nil {
+			page = 0
+		}
+	}
+	items_page := itemsPage(h.r)
+	res, num, _ := h.db.GetBooks(buildQuery(req), items_page, page*items_page)
+
+	var data searchData
+	data.S = GetStatus(h)
+	data.S.Search = req
+	data.Books = res
+	data.ItemsPage = items_page
+	data.Found = num
+	data.Page = page + 1
+	if num > (page+1)*items_page {
+		data.Next = "/search/?q=" + req + "&p=" + strconv.Itoa(page+1) + "&num=" + strconv.Itoa(items_page)
+	}
+	if page > 0 {
+		data.Prev = "/search/?q=" + req + "&p=" + strconv.Itoa(page-1) + "&num=" + strconv.Itoa(items_page)
+	}
+
+	format := h.r.Form["fmt"]
+	if (len(format) > 0) && (format[0] == "rss") {
+		loadTxtTemplate(h.w, "search_rss.xml", data)
+	} else {
+		loadTemplate(h.w, "search", data)
+	}
+}
+
+func itemsPage(r *http.Request) int {
+	if len(r.Form["num"]) > 0 {
+		items_page, err := strconv.Atoi(r.Form["num"][0])
+		if err == nil {
+			return items_page
+		}
+	}
+	return SEARCH_ITEMS_PAGE
+}
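
[Example of the query built above, not part of the original patch — the
importer's duplicate check calls buildQuery("title:" + title). For a book
titled "Dune" this yields roughly:

    bson.M{"active": true, "title": bson.RegEx{"Dune", "i"}}

Since buildQuery splits on spaces, only the first word of a multi-word title
lands in the title regex; the remaining words are matched case-insensitively
against the keywords field.]
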
diff --git a/tools/importer/session.go b/tools/importer/session.go
new file mode 100644
index 0000000..e958cdc
--- /dev/null
+++ b/tools/importer/session.go
@@ -0,0 +1,81 @@
+package main
+
+import (
+	"encoding/hex"
+	"github.com/gorilla/securecookie"
+	"github.com/gorilla/sessions"
+	"net/http"
+)
+
+var sesStore = sessions.NewCookieStore(securecookie.GenerateRandomKey(64))
+
+type Notification struct {
+	Title string
+	Msg   string
+	Type  string /* error, info or success */
+}
+
+type Session struct {
+	User string
+	Role string
+	S    *sessions.Session
+}
+
+func GetSession(r *http.Request, db *DB) (s *Session) {
+	s = new(Session)
+	var err error
+	s.S, err = sesStore.Get(r, "session")
+	if err == nil && !s.S.IsNew {
+		s.User, _ = s.S.Values["user"].(string)
+		s.Role = db.UserRole(s.User)
+	}
+
+	if s.S.IsNew {
+		s.S.Values["id"] = hex.EncodeToString(securecookie.GenerateRandomKey(16))
+	}
+
+	return
+}
+
+func (s *Session) GetNotif() []Notification {
+	session := s.S
+	msgs := session.Flashes("nMsg")
+	titles := session.Flashes("nTitle")
+	tpes := session.Flashes("nType")
+	notif := make([]Notification, len(msgs))
+	for i, m := range msgs {
+		msg, _ := m.(string)
+		title, _ := titles[i].(string)
+		tpe, _ := tpes[i].(string)
+		notif[i] = Notification{title, msg, tpe}
+	}
+	return notif
+}
+
+func (s *Session) LogIn(user string) {
+	s.User = user
+	s.S.Values["user"] = user
+}
+
+func (s *Session) LogOut() {
+	s.S.Values["user"] = ""
+}
+
+func (s *Session) Notify(title, msg, tpe string) {
+	s.S.AddFlash(msg, "nMsg")
+	s.S.AddFlash(title, "nTitle")
+	s.S.AddFlash(tpe, "nType")
+}
+
+func (s *Session) Save(w http.ResponseWriter, r *http.Request) {
+	sesStore.Save(r, w, s.S)
+}
+
+func (s *Session) Id() string {
+	id, _ := s.S.Values["id"].(string)
+	return id
+}
+
+func (s *Session) IsAdmin() bool {
+	return s.Role == "admin"
+}
diff --git a/tools/importer/stats.go b/tools/importer/stats.go
new file mode 100644
index 0000000..4218ddf
--- /dev/null
+++ b/tools/importer/stats.go
@@ -0,0 +1,244 @@
+package main
+
+import log "github.com/cihub/seelog"
+
+import (
+	"github.com/gorilla/mux"
+	"labix.org/v2/mgo/bson"
+	"net/http"
+	"strconv"
+	"strings"
+	"time"
+)
+
+type handler struct {
+	w    http.ResponseWriter
+	r    *http.Request
+	sess *Session
+	db   *DB
+}
+
+func InitStats(database *DB) {
+	statsChannel = make(chan statsRequest, CHAN_SIZE)
+	go statsWorker(database)
+}
+
+func GatherStats(function func(handler), database *DB) func(http.ResponseWriter, *http.Request) {
+	return func(w http.ResponseWriter, r *http.Request) {
+		log.Info("Query ", r.Method, " ", r.RequestURI)
+
+		var h handler
+		h.db = database.Copy()
+		defer h.db.Close()
+
+		h.w = w
+		h.r = r
+		h.sess = GetSession(r, h.db)
+		function(h)
+
+		statsChannel <- statsRequest{bson.Now(), mux.Vars(r), h.sess, r}
+	}
+}
+
+var statsChannel chan statsRequest
+
+type statsRequest struct {
+	date time.Time
+	vars map[string]string
+	sess *Session
+	r    *http.Request
+}
+
+func statsWorker(database *DB) {
+	db := database.Copy()
+	defer db.Close()
+
+	for req := range statsChannel {
+		stats := make(map[string]interface{})
+		appendFiles(req.r, stats)
+		appendMuxVars(req.vars, stats)
+		appendUrl(req.r, stats)
+		appendSession(req.sess, stats)
+		stats["method"] = req.r.Method
+		stats["date"] = req.date
+		db.InsertStats(stats)
+	}
+}
+
+func statsHandler(h handler) {
+	var data statsData
+	data.S = GetStatus(h)
+	data.S.Stats = true
+	data.HVisits = getHourlyVisits(h.db)
+	data.DVisits = getDailyVisits(h.db)
+	data.MVisits = getMonthlyVisits(h.db)
+	data.HDownloads = getHourlyDownloads(h.db)
+	data.DDownloads = getDailyDownloads(h.db)
+	data.MDownloads = getMonthlyDownloads(h.db)
+
+	loadTemplate(h.w, "stats", data)
+}
+
+type statsData struct {
+	S          Status
+	HVisits    []visitData
+	DVisits    []visitData
+	MVisits    []visitData
+	HDownloads []visitData
+	DDownloads []visitData
+	MDownloads []visitData
+}
+
+type visitData struct {
+	Label string
+	Count int
+}
+
+func getHourlyVisits(db *DB) []visitData {
+	var visits []visitData
+
+	visit, _ := db.GetHourVisits()
+	for _, v := range visit {
+		var elem visitData
+		hour := time.Unix(v.Date/1000, 0).UTC().Hour()
+		elem.Label = strconv.Itoa(hour + 1)
+		elem.Count = v.Count
+		visits = append(visits, elem)
+	}
+
+	return visits
+}
+
+func getDailyVisits(db *DB) []visitData {
+	var visits []visitData
+
+	visit, _ := db.GetDayVisits()
+	for _, v := range visit {
+		var elem visitData
+		day := time.Unix(v.Date/1000, 0).UTC().Day()
+		elem.Label = strconv.Itoa(day)
+		elem.Count = v.Count
+		visits = append(visits, elem)
+	}
+
+	return visits
+}
+
+func getMonthlyVisits(db *DB) []visitData {
+	var visits []visitData
+
+	visit, _ := db.GetMonthVisits()
+	for _, v := range visit {
+		var elem visitData
+		month := time.Unix(v.Date/1000, 0).UTC().Month()
+		elem.Label = month.String()
+		elem.Count = v.Count
+		visits = append(visits, elem)
+	}
+
+	return visits
+}
+
+func getHourlyDownloads(db *DB) []visitData {
+	var visits []visitData
+
+	visit, _ := db.GetHourDownloads()
+	for _, v := range visit {
+		var elem visitData
+		hour := time.Unix(v.Date/1000, 0).UTC().Hour()
+		elem.Label = strconv.Itoa(hour + 1)
+		elem.Count = v.Count
+		visits = append(visits, elem)
+	}
+
+	return visits
+}
+
+func getDailyDownloads(db *DB) []visitData {
+	var visits []visitData
+
+	visit, _ := db.GetDayDownloads()
+	for _, v := range visit {
+		var elem visitData
+		day := time.Unix(v.Date/1000, 0).UTC().Day()
+		elem.Label = strconv.Itoa(day)
+		elem.Count = v.Count
+		visits = append(visits, elem)
+	}
+
+	return visits
+}
+
+func getMonthlyDownloads(db *DB) []visitData {
+	var visits []visitData
+
+	visit, _ := db.GetMonthDownloads()
+	for _, v := range visit {
+		var elem visitData
+		month := time.Unix(v.Date/1000, 0).UTC().Month()
+		elem.Label = month.String()
+		elem.Count = v.Count
+		visits = append(visits, elem)
+	}
+
+	return visits
+}
+
+func appendFiles(r *http.Request, stats map[string]interface{}) {
+	if r.Method == "POST" && r.MultipartForm != nil {
+		files := r.MultipartForm.File
+		for key := range files {
+			list := make([]string, len(files[key]))
+			for i, f := range files[key] {
+				list[i] = f.Filename
+			}
+			stats[key] = list
+		}
+	}
+}
+
+func appendMuxVars(vars map[string]string, stats map[string]interface{}) {
+	for key, value := range vars {
+		switch {
+		case key == "id":
+			if bson.IsObjectIdHex(value) {
+				stats["id"] = bson.ObjectIdHex(value)
+			}
+		case key == "ids":
+			var objectIds []bson.ObjectId
+			ids := strings.Split(value, "/")
+			for _, id := range ids {
+				if bson.IsObjectIdHex(id) {
+					objectIds = append(objectIds, bson.ObjectIdHex(id))
+				}
+			}
+			if len(objectIds) > 0 {
+				stats["ids"] = objectIds
+				stats["id"] = objectIds[0]
+			}
+		default:
+			stats[key] = value
+		}
+	}
+}
+
+func appendUrl(r *http.Request, stats map[string]interface{}) {
+	for key, value := range r.URL.Query() {
+		stats[key] = value
+	}
+	stats["host"] = r.Host
+	stats["path"] = r.URL.Path
+	pattern := strings.Split(r.URL.Path, "/")
+	if len(pattern) > 1 && pattern[1] != "" {
+		stats["section"] = pattern[1]
+	} else {
+		stats["section"] = "/"
+	}
+}
+
+func appendSession(sess *Session, stats map[string]interface{}) {
+	stats["session"] = sess.Id()
+	if sess.User != "" {
+		stats["user"] = sess.User
+	}
+}
diff --git a/tools/importer/store.go b/tools/importer/store.go
new file mode 100644
index 0000000..5b0ee8c
--- /dev/null
+++ b/tools/importer/store.go
@@ -0,0 +1,128 @@
+package main
+
+import (
+	"bytes"
+	"git.gitorious.org/go-pkg/epubgo.git"
+	"io"
+	"io/ioutil"
+	"labix.org/v2/mgo/bson"
+	"regexp"
+	"strings"
+)
+
+func OpenBook(id bson.ObjectId, db *DB) (*epubgo.Epub, error) {
+	fs := db.GetFS(FS_BOOKS)
+	f, err := fs.OpenId(id)
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	buff, err := ioutil.ReadAll(f)
+	reader := bytes.NewReader(buff)
+
+	return epubgo.Load(reader, int64(len(buff)))
+}
+
+func StoreNewFile(name string, file io.Reader, db *DB) (bson.ObjectId, int64, error) {
+	fs := db.GetFS(FS_BOOKS)
+	fw, err := fs.Create(name)
+	if err != nil {
+		return "", 0, err
+	}
+	defer fw.Close()
+
+	size, err := io.Copy(fw, file)
+	id, _ := fw.Id().(bson.ObjectId)
+	return id, size, err
+}
+
+func DeleteFile(id bson.ObjectId, db *DB) error {
+	fs := db.GetFS(FS_BOOKS)
+	return fs.RemoveId(id)
+}
+
+func DeleteCover(id bson.ObjectId, db *DB) error {
+	fs := db.GetFS(FS_IMGS)
+	return fs.RemoveId(id)
+}
+
+func DeleteBook(book Book, db *DB) {
+	if book.Cover != "" {
+		DeleteCover(book.Cover, db)
+	}
+	if book.CoverSmall != "" {
+		DeleteCover(book.CoverSmall, db)
+	}
+	DeleteFile(book.File, db)
+}
+
+func cleanStr(str string) string {
+	str = strings.Replace(str, "&#39;", "'", -1)
+	exp, _ := regexp.Compile("&[^;]*;")
+	str = exp.ReplaceAllString(str, "")
+	exp, _ = regexp.Compile("[ ,]*$")
+	str = exp.ReplaceAllString(str, "")
+	return str
+}
+
+func parseAuthr(creator []string) []string {
+	exp1, _ := regexp.Compile("^(.*\\( *([^\\)]*) *\\))*$")
+	exp2, _ := regexp.Compile("^[^:]*: *(.*)$")
+	res := make([]string, len(creator))
+	for i, s := range creator {
+		auth := exp1.FindStringSubmatch(s)
+		if auth != nil {
+			res[i] = cleanStr(strings.Join(auth[2:], ", "))
+		} else {
+			auth := exp2.FindStringSubmatch(s)
+			if auth != nil {
+				res[i] = cleanStr(auth[1])
+			} else {
+				res[i] = cleanStr(s)
+			}
+		}
+	}
+	return res
+}
+
+func parseDescription(description []string) string {
+	str := cleanStr(strings.Join(description, "\n"))
+	str = strings.Replace(str, "</p>", "\n", -1)
+	exp, _ := regexp.Compile("<[^>]*>")
+	str = exp.ReplaceAllString(str, "")
+	str = strings.Replace(str, "&amp;", "&", -1)
+	str = strings.Replace(str, "&lt;", "<", -1)
+	str = strings.Replace(str, "&gt;", ">", -1)
+	str = strings.Replace(str, "\\n", "\n", -1)
+	return str
+}
+
+func parseSubject(subject []string) []string {
+	var res []string
+	for _, s := range subject {
+		res = append(res, strings.Split(s, " / ")...)
+	}
+	return res
+}
+
+func parseDate(date []string) string {
+	if len(date) == 0 {
+		return ""
+	}
+	return strings.Replace(date[0], "Unspecified: ", "", -1)
+}
+
+func keywords(b map[string]interface{}) (k []string) {
+	title, _ := b["title"].(string)
+	k = strings.Split(title, " ")
+	author, _ := b["author"].([]string)
+	for _, a := range author {
+		k = append(k, strings.Split(a, " ")...)
+	}
+	publisher, _ := b["publisher"].(string)
+	k = append(k, strings.Split(publisher, " ")...)
+	subject, _ := b["subject"].([]string)
+	k = append(k, subject...)
+	return
+}
diff --git a/tools/importer/upload.go b/tools/importer/upload.go
new file mode 100644
index 0000000..8f05f0a
--- /dev/null
+++ b/tools/importer/upload.go
@@ -0,0 +1,146 @@
+package main
+
+import log "github.com/cihub/seelog"
+
+import (
+	"bytes"
+	"git.gitorious.org/go-pkg/epubgo.git"
+	"io/ioutil"
+	"mime/multipart"
+	"strings"
+)
+
+func InitUpload(database *DB) {
+	uploadChannel = make(chan uploadRequest, CHAN_SIZE)
+	go uploadWorker(database)
+}
+
+var uploadChannel chan uploadRequest
+
+type uploadRequest struct {
+	file     multipart.File
+	filename string
+}
+
+func uploadWorker(database *DB) {
+	db := database.Copy()
+	defer db.Close()
+
+	for req := range uploadChannel {
+		processFile(req, db)
+	}
+}
+
+func processFile(req uploadRequest, db *DB) {
+	defer req.file.Close()
+
+	epub, err := openMultipartEpub(req.file)
+	if err != nil {
+		log.Warn("Not a valid epub uploaded file ", req.filename, ": ", err)
+		return
+	}
+	defer epub.Close()
+
+	book := parseFile(epub, db)
+	title, _ := book["title"].(string)
+	req.file.Seek(0, 0)
+	id, size, err := StoreNewFile(title+".epub", req.file, db)
+	if err != nil {
+		log.Error("Error storing book (", title, "): ", err)
+		return
+	}
+
+	book["file"] = id
+	book["filesize"] = size
+	err = db.InsertBook(book)
+	if err != nil {
+		log.Error("Error storing metadata (", title, "): ", err)
+		return
+	}
+	log.Info("File uploaded: ", req.filename)
+}
+
+func uploadPostHandler(h handler) {
+	problem := false
+
+	h.r.ParseMultipartForm(20000000)
+	filesForm := h.r.MultipartForm.File["epub"]
+	for _, f := range filesForm {
+		file, err := f.Open()
+		if err != nil {
+			log.Error("Can not open uploaded file ", f.Filename, ": ", err)
+			h.sess.Notify("Upload problem!", "There was a problem with book "+f.Filename, "error")
+			problem = true
+			continue
+		}
+		uploadChannel <- uploadRequest{file, f.Filename}
+	}
+
+	if !problem {
+		if len(filesForm) > 0 {
+			h.sess.Notify("Upload successful!", "Thank you for your contribution", "success")
+		} else {
+			h.sess.Notify("Upload problem!", "No books were uploaded.", "error")
+		}
+	}
+	uploadHandler(h)
+}
+
+func uploadHandler(h handler) {
+	var data uploadData
+	data.S = GetStatus(h)
+	data.S.Upload = true
+	loadTemplate(h.w, "upload", data)
+}
+
+type uploadData struct {
+	S Status
+}
+
+func openMultipartEpub(file multipart.File) (*epubgo.Epub, error) {
+	buff, _ := ioutil.ReadAll(file)
+	reader := bytes.NewReader(buff)
+	return epubgo.Load(reader, int64(len(buff)))
+}
+
+func parseFile(epub *epubgo.Epub, db *DB) map[string]interface{} {
+	book := map[string]interface{}{}
+	for _, m := range epub.MetadataFields() {
+		data, err := epub.Metadata(m)
+		if err != nil {
+			continue
+		}
+		switch m {
+		case "creator":
+			book["author"] = parseAuthr(data)
+		case "description":
+			book[m] = parseDescription(data)
+		case "subject":
+			book[m] = parseSubject(data)
+		case "date":
+			book[m] = parseDate(data)
+		case "language":
+			book["lang"] = data
+		case "title", "contributor", "publisher":
+			book[m] = cleanStr(strings.Join(data, ", "))
+		case "identifier":
+			attr, _ := epub.MetadataAttr(m)
+			for i, d := range data {
+				if attr[i]["scheme"] == "ISBN" {
+					book["isbn"] = d
+				}
+			}
+		default:
+			book[m] = strings.Join(data, ", ")
+		}
+	}
+	title, _ := book["title"].(string)
+	book["file"] = nil
+	cover, coverSmall := GetCover(epub, title, db)
+	if cover != "" {
+		book["cover"] = cover
+		book["coversmall"] = coverSmall
+	}
+	book["keywords"] = keywords(book)
+	return book
+}