snowflake/common/amp/armor_decoder.go
David Fifield c9e0dd287f amp package.
This package contains a CacheURL function that modifies a URL to be
accessed through an AMP cache, and the "AMP armor" data encoding scheme
for encoding data into the AMP subset of HTML.
2021-08-05 16:13:24 -06:00

136 lines
3.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package amp
import (
"bufio"
"bytes"
"encoding/base64"
"fmt"
"io"
"golang.org/x/net/html"
)
// ErrUnknownVersion is the error returned when the first character inside the
// element encoding (but outside the base64 encoding) is not '0'. Its value is
// the unrecognized version indicator byte.
type ErrUnknownVersion byte

// Error implements the error interface. The offending byte is rendered as an
// ASCII-only quoted character literal.
func (v ErrUnknownVersion) Error() string {
	const format = "unknown armor version indicator %+q"
	indicator := byte(v)
	return fmt.Sprintf(format, indicator)
}
// isASCIIWhitespace reports whether b is one of the five "ASCII whitespace"
// bytes: tab, line feed, form feed, carriage return, or space.
//
// https://infra.spec.whatwg.org/#ascii-whitespace
func isASCIIWhitespace(b byte) bool {
	return b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
}
// splitASCIIWhitespace has the signature of a bufio.SplitFunc. It yields
// maximal runs of bytes that are not ASCII whitespace (as defined by
// isASCIIWhitespace), discarding the whitespace between them.
//
// A run is returned as a token once the whitespace byte that terminates it
// has been seen, or when atEOF is true and the run extends to the end of
// data. Otherwise no token is returned and only the leading whitespace is
// consumed, which asks the caller for more data.
func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Find the first byte that is not whitespace.
	start := 0
	for start < len(data) && isASCIIWhitespace(data[start]) {
		start++
	}

	// Find the whitespace byte that ends the run beginning at start. The
	// terminating byte is consumed along with the token.
	for end := start; end < len(data); end++ {
		if isASCIIWhitespace(data[end]) {
			return end + 1, data[start:end], nil
		}
	}

	// No terminator in data. An unterminated, non-empty run is a token
	// only if no more input is coming.
	if atEOF && start < len(data) {
		return len(data), data[start:], nil
	}

	// Skip the leading whitespace and wait for more data.
	return start, nil, nil
}
// decodeToWriter tokenizes r as HTML and writes to w the text found inside
// <pre> elements, with ASCII whitespace removed. Text outside <pre> elements
// is ignored. It returns the number of bytes written to w and the first error
// encountered.
//
// It is an error for a <pre> start tag to appear inside an open <pre>
// element, for a </pre> end tag to appear when no <pre> element is open, or
// for the input to end while a <pre> element is still open. An io.EOF from
// the tokenizer marks the normal end of input and is reported as a nil error.
func decodeToWriter(w io.Writer, r io.Reader) (int64, error) {
	tokenizer := html.NewTokenizer(r)
	// Set a memory limit on token sizes, otherwise the tokenizer will
	// buffer text indefinitely if it is not broken up by other token types.
	tokenizer.SetMaxBuf(elementSizeLimit)

	active := false // true while inside a <pre> element
	total := int64(0)
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input is not an error in itself.
				err = nil
			}
			if err == nil && active {
				return total, fmt.Errorf("missing </pre> tag")
			}
			return total, err
		case html.TextToken:
			if active {
				// Re-join the separate chunks of text and
				// feed them to the decoder.
				scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
				scanner.Split(splitASCIIWhitespace)
				for scanner.Scan() {
					n, err := w.Write(scanner.Bytes())
					total += int64(n)
					if err != nil {
						return total, err
					}
				}
				if err := scanner.Err(); err != nil {
					return total, err
				}
			}
		case html.StartTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "pre" {
				if active {
					// Nesting is not allowed. The message is
					// built from tn because TagName has already
					// consumed the name: tokenizer.Token() at
					// this point would render an empty tag.
					return total, fmt.Errorf("unexpected <%s>", tn)
				}
				active = true
			}
		case html.EndTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "pre" {
				if !active {
					// Stray end tag. As above, the name comes
					// from tn rather than tokenizer.Token().
					return total, fmt.Errorf("unexpected </%s>", tn)
				}
				active = false
			}
		}
	}
}
// NewArmorDecoder returns a new AMP armor decoder.
//
// It starts a goroutine that runs decodeToWriter on r and feeds the result
// into a pipe; whatever error decodeToWriter returns (possibly nil) becomes
// the error seen by the reading side once the piped data is exhausted.
//
// Before returning, NewArmorDecoder reads the first byte from the pipe, which
// is the version indicator, so it blocks until that byte is available or the
// decoding ends. For version '0', the returned io.Reader yields the remaining
// data decoded as standard base64. For any other version byte the error is an
// ErrUnknownVersion. If the version byte cannot be read, the read error is
// returned. In both error cases the returned io.Reader is nil and the pipe is
// closed.
func NewArmorDecoder(r io.Reader) (io.Reader, error) {
	pr, pw := io.Pipe()
	go func() {
		_, err := decodeToWriter(pw, r)
		pw.CloseWithError(err)
	}()

	// The first byte inside the element encoding is a server-client
	// protocol version indicator. io.ReadFull is used rather than a bare
	// Read so that a read returning zero bytes with a nil error is never
	// mistaken for a version byte.
	var version [1]byte
	if _, err := io.ReadFull(pr, version[:]); err != nil {
		pr.CloseWithError(err)
		return nil, err
	}

	switch version[0] {
	case '0':
		return base64.NewDecoder(base64.StdEncoding, pr), nil
	default:
		err := ErrUnknownVersion(version[0])
		pr.CloseWithError(err)
		return nil, err
	}
}