Move all the code to a lib folder

This commit is contained in:
Las Zenow 2016-05-02 21:36:49 -04:00
parent e963d00014
commit 9d1f1ad5c0
31 changed files with 123 additions and 98 deletions

81
lib/parser/isbn.go Normal file
View file

@ -0,0 +1,81 @@
package parser
import (
"strings"
"unicode"
)
// ISBN extracts an ISBN from orig and returns it as an ISBN-13 string.
//
// The candidate produced by getISBN must be 10 or 13 bytes long and carry a
// valid check digit; otherwise the empty string is returned.
func ISBN(orig string) string {
	candidate := getISBN(orig)
	switch len(candidate) {
	case 10, 13:
		if validChecksum(candidate) {
			return toISBN13(candidate)
		}
	}
	return ""
}
// getISBN pulls the ISBN candidate out of src.
//
// The input is upper-cased, everything before the first ASCII digit and
// everything after the last ASCII digit or 'X' is trimmed, hyphens and spaces
// are removed, and the result is cut to at most 13 bytes. No checksum
// validation happens here.
//
// Only ASCII digits count as digits: the check-digit arithmetic works on
// r-'0', so other Unicode numerics (superscripts, fractions, non-Latin
// digits) can never be part of a valid ISBN and are treated as noise.
func getISBN(src string) string {
	isASCIIDigit := func(r rune) bool {
		return r <= unicode.MaxASCII && unicode.IsDigit(r)
	}
	isNotDigit := func(r rune) bool {
		return !isASCIIDigit(r)
	}
	// 'X' is only legal as the trailing ISBN-10 check digit, so it is kept on
	// the right-hand side only.
	isNotDigitOrX := func(r rune) bool {
		return !isASCIIDigit(r) && r != 'X'
	}

	isbn := strings.ToUpper(src)
	isbn = strings.TrimLeftFunc(isbn, isNotDigit)
	isbn = strings.TrimRightFunc(isbn, isNotDigitOrX)
	isbn = strings.Replace(isbn, "-", "", -1)
	isbn = strings.Replace(isbn, " ", "", -1)

	// Anything past 13 bytes is trailing junk (e.g. "9788865971468-113465").
	if len(isbn) > 13 {
		isbn = isbn[:13]
	}
	return isbn
}
// validChecksum reports whether the last character of isbn equals the check
// digit computed from the preceding characters.
//
// Only 10- and 13-byte inputs are supported; any other length is reported as
// invalid instead of indexing past the end of the string.
func validChecksum(isbn string) bool {
	switch len(isbn) {
	case 10:
		return rune(isbn[9]) == checkDigit10(isbn)
	case 13:
		return rune(isbn[12]) == checkDigit13(isbn)
	default:
		return false
	}
}
// toISBN13 returns isbn unchanged when it is already 13 bytes long. Otherwise
// it prepends "978", keeps the first 12 bytes of the result and appends the
// ISBN-13 check digit computed for them.
func toISBN13(isbn string) string {
	if len(isbn) != 13 {
		prefixed := "978" + isbn
		return prefixed[:12] + string(checkDigit13(prefixed))
	}
	return isbn
}
// checkDigit10 computes the ISBN-10 check character for the first nine
// characters of isbn: each is weighted 10 down to 2, and the result is the
// value that brings the weighted sum to a multiple of 11. A value of 10 is
// written as 'X'.
func checkDigit10(isbn string) rune {
	sum := 0
	for pos, digit := range isbn[:9] {
		weight := 10 - pos
		sum += weight * int(digit-'0')
	}

	switch value := (11 - (sum % 11)) % 11; value {
	case 10:
		return 'X'
	default:
		return rune(value + '0')
	}
}
// checkDigit13 computes the ISBN-13 check digit for the first twelve
// characters of isbn: characters at odd offsets are weighted 3, the others 1,
// and the result is the digit that brings the weighted sum to a multiple
// of 10.
func checkDigit13(isbn string) rune {
	sum := 0
	for pos, digit := range isbn[:12] {
		weight := 1
		if pos%2 != 0 {
			weight = 3
		}
		sum += weight * int(digit-'0')
	}
	value := (10 - (sum % 10)) % 10
	return rune(value + '0')
}

26
lib/parser/isbn_test.go Normal file
View file

@ -0,0 +1,26 @@
package parser
import "testing"
// TestISBN checks ISBN against raw identifiers paired with the ISBN-13 it is
// expected to return ("" when the input must be rejected).
func TestISBN(t *testing.T) {
	cases := []struct {
		src, dst string
	}{
		{"", ""},
		{"978074341", ""},
		{"9780743412395", ""},
		{"9780743412391", "9780743412391"},
		{"0-688-12189-6", "9780688121891"},
		{"033026155X", "9780330261555"},
		{"033026155x", "9780330261555"},
		{"0307756432", "9780307756435"},
		{"urn:isbn:978-3-8387-0337-4:", "9783838703374"},
		{"EPUB9788865971468-113465", "9788865971468"},
	}

	for _, c := range cases {
		res := ISBN(c.src)
		if res == c.dst {
			continue
		}
		t.Error("ISBN parse failed: ", c.src, " => ", res, " (expected ", c.dst, ")")
	}
}

66
lib/parser/language.go Normal file
View file

@ -0,0 +1,66 @@
package parser
import (
"io/ioutil"
"strings"
"github.com/meskio/epubgo"
"github.com/rainycape/cld2"
)
// GuessLang runs language detection over every spine document of epub and
// returns the most frequently detected language as a one-element slice when
// it differs from the language declared in orig_langs.
//
// orig_langs is returned unchanged when the spine cannot be read, when no
// language other than "un" is detected, or when the detected language agrees
// with the declared one. Documents that fail to open or read are skipped.
func GuessLang(epub *epubgo.Epub, orig_langs []string) []string {
	spine, err := epub.Spine()
	if err != nil {
		return orig_langs
	}

	var langs []string
	var errSpine error
	for errSpine == nil {
		html, err := spine.Open()
		// Advance before inspecting err so a broken document cannot stall
		// the loop.
		errSpine = spine.Next()
		if err != nil {
			continue
		}

		buff, err := ioutil.ReadAll(html)
		// Close right away instead of deferring: a defer inside the loop
		// would keep every document open until the function returns.
		html.Close()
		if err != nil {
			continue
		}

		langs = append(langs, cld2.Detect(string(buff)))
	}

	lang := commonLang(langs)
	if lang != "un" && differentLang(lang, orig_langs) {
		return []string{lang}
	}
	return orig_langs
}
// commonLang returns the language code that occurs most often in langs,
// ignoring "un" entries. It returns "un" when langs holds nothing else.
//
// Ties are resolved in favour of the language that appears first in langs,
// so the result is deterministic for a given input.
func commonLang(langs []string) string {
	count := make(map[string]int, len(langs))
	for _, l := range langs {
		count[l]++
	}

	lang := "un"
	maxcount := 0
	// Walk the input slice rather than the map: map iteration order is
	// random and would make tie-breaking vary between runs.
	for _, l := range langs {
		if l == "un" {
			continue
		}
		if c := count[l]; c > maxcount {
			lang = l
			maxcount = c
		}
	}
	return lang
}
// differentLang reports whether lang differs from the language declared in
// orig_langs.
//
// Only the first declared entry is considered, reduced to its first two bytes
// in lower case (so "en-US" and "EN" both compare as "en"). When there is no
// entry, or the first entry is shorter than two bytes, the declared language
// is taken to be "un".
func differentLang(lang string, orig_langs []string) bool {
	declared := "un"
	// The length check is on the first entry itself: it guards the [:2]
	// slice below.
	if len(orig_langs) > 0 && len(orig_langs[0]) >= 2 {
		declared = strings.ToLower(orig_langs[0][:2])
	}
	return declared != lang
}

103
lib/parser/parser.go Normal file
View file

@ -0,0 +1,103 @@
package parser
import (
"regexp"
"strings"
"github.com/meskio/epubgo"
)
// MetaData holds the metadata extracted from a book, keyed by field name.
type MetaData map[string]interface{}
// EpubMetadata reads every metadata field of epub and returns the values in a
// MetaData map.
//
// Fields that cannot be read are skipped. Several fields are normalised and
// some are stored under a different key:
//
//	creator     -> "author" (list of cleaned names)
//	language    -> "lang"   (result of GuessLang)
//	identifier  -> "isbn"   (only identifiers whose scheme attribute is "ISBN"
//	                         and that parse as a valid ISBN)
//	description, subject, date, title, contributor, publisher -> same key
//
// Any other field is stored under its own name as its values joined by ", ".
func EpubMetadata(epub *epubgo.Epub) MetaData {
	metadata := MetaData{}
	for _, m := range epub.MetadataFields() {
		data, err := epub.Metadata(m)
		if err != nil {
			continue
		}
		switch m {
		case "creator":
			metadata["author"] = parseAuthr(data)
		case "description":
			metadata[m] = parseDescription(data)
		case "subject":
			metadata[m] = parseSubject(data)
		case "date":
			metadata[m] = parseDate(data)
		case "language":
			metadata["lang"] = GuessLang(epub, data)
		case "title", "contributor", "publisher":
			metadata[m] = cleanStr(strings.Join(data, ", "))
		case "identifier":
			attr, err := epub.MetadataAttr(m)
			if err != nil {
				// Without attributes the scheme of each identifier is
				// unknown, so none of them can be taken as an ISBN.
				continue
			}
			for i, d := range data {
				if i >= len(attr) {
					// No attributes for the remaining identifiers.
					break
				}
				if attr[i]["scheme"] != "ISBN" {
					continue
				}
				if isbn := ISBN(d); isbn != "" {
					metadata["isbn"] = isbn
				}
			}
		default:
			metadata[m] = strings.Join(data, ", ")
		}
	}
	return metadata
}
// htmlEntityRe matches an ampersand followed by everything up to and
// including the next semicolon, i.e. an HTML entity such as "&amp;".
var htmlEntityRe = regexp.MustCompile("&[^;]*;")

// cleanStr normalises a metadata string: the apostrophe entity "&#39;" is
// decoded to "'", every other "&...;" entity is removed, and trailing spaces
// and commas are stripped.
func cleanStr(str string) string {
	// Decode the apostrophe first; otherwise the entity strip below would
	// delete it and turn "O&#39;Reilly" into "OReilly".
	str = strings.Replace(str, "&#39;", "'", -1)
	str = htmlEntityRe.ReplaceAllString(str, "")
	return strings.TrimRight(str, " ,")
}
var (
	// authorParenRe matches a creator string that ends in a parenthesised
	// part and captures the text inside those parentheses. It also matches
	// the empty string.
	authorParenRe = regexp.MustCompile("^(.*\\( *([^\\)]*) *\\))*$")
	// authorPrefixRe matches "prefix: rest" and captures the rest.
	authorPrefixRe = regexp.MustCompile("^[^:]*: *(.*)$")
)

// parseAuthr extracts a cleaned author name from each creator entry and
// returns them in the same order.
//
// For every entry the first rule that applies is used:
//  1. the entry ends in "( ... )": the text inside the parentheses;
//  2. the entry has the form "prefix: rest": the rest;
//  3. otherwise the entry itself.
//
// The chosen text is passed through cleanStr.
func parseAuthr(creator []string) []string {
	res := make([]string, len(creator))
	for i, s := range creator {
		if m := authorParenRe.FindStringSubmatch(s); m != nil {
			res[i] = cleanStr(strings.Join(m[2:], ", "))
			continue
		}
		if m := authorPrefixRe.FindStringSubmatch(s); m != nil {
			res[i] = cleanStr(m[1])
			continue
		}
		res[i] = cleanStr(s)
	}
	return res
}
// htmlTagRe matches anything that looks like an HTML tag: "<" up to the next
// ">".
var htmlTagRe = regexp.MustCompile("<[^>]*>")

// parseDescription joins the description entries with newlines and turns the
// result into plain text: the string is passed through cleanStr, "</p>"
// becomes a newline, remaining tags are removed and the literal two-character
// sequence `\n` is converted to a real newline.
func parseDescription(description []string) string {
	str := cleanStr(strings.Join(description, "\n"))
	str = strings.Replace(str, "</p>", "\n", -1)
	str = htmlTagRe.ReplaceAllString(str, "")
	// NOTE(review): cleanStr above already deletes every "&...;" sequence, so
	// these three replacements appear unable to match. If entities are meant
	// to be decoded they have to be handled before the strip; confirm the
	// intended behaviour before reordering.
	str = strings.Replace(str, "&amp;", "&", -1)
	str = strings.Replace(str, "&lt;", "<", -1)
	str = strings.Replace(str, "&gt;", ">", -1)
	str = strings.Replace(str, "\\n", "\n", -1)
	return str
}
// parseSubject splits every subject entry on " / " and returns all the pieces
// as one flat list, in order. The result is nil when subject is empty.
func parseSubject(subject []string) []string {
	var topics []string
	for i := range subject {
		parts := strings.Split(subject[i], " / ")
		topics = append(topics, parts...)
	}
	return topics
}
// parseDate returns the first date entry with every "Unspecified: " marker
// removed, or "" when there are no entries. Later entries are ignored.
func parseDate(date []string) string {
	if len(date) > 0 {
		first := date[0]
		return strings.Replace(first, "Unspecified: ", "", -1)
	}
	return ""
}