mirror of
https://gitlab.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake.git
synced 2025-10-14 05:11:19 -04:00
This package contains a CacheURL function that modifies a URL to be accessed through an AMP cache, and the "AMP armor" data encoding scheme for encoding data into the AMP subset of HTML.
136 lines
3.2 KiB
Go
136 lines
3.2 KiB
Go
package amp
|
||
|
||
import (
|
||
"bufio"
|
||
"bytes"
|
||
"encoding/base64"
|
||
"fmt"
|
||
"io"
|
||
|
||
"golang.org/x/net/html"
|
||
)
|
||
|
||
// ErrUnknownVersion is the error returned when the first character inside the
// element encoding (but outside the base64 encoding) is not '0'. The
// underlying byte is the unrecognized version indicator that was read.
type ErrUnknownVersion byte

// Error implements the error interface. The offending byte is formatted with
// %+q, so it is shown as a quoted, ASCII-only character literal.
func (err ErrUnknownVersion) Error() string {
	return fmt.Sprintf("unknown armor version indicator %+q", byte(err))
}
|
||
|
||
// isASCIIWhitespace reports whether b is one of the five bytes the WHATWG
// Infra standard classifies as ASCII whitespace: TAB, LF, FF, CR, or SPACE.
// Note that vertical tab (0x0b) is not in that set.
func isASCIIWhitespace(b byte) bool {
	switch b {
	// https://infra.spec.whatwg.org/#ascii-whitespace
	case '\x09', '\x0a', '\x0c', '\x0d', '\x20':
		return true
	default:
		return false
	}
}

// splitASCIIWhitespace is a bufio.SplitFunc that yields maximal runs of
// non-whitespace bytes, where whitespace is defined by isASCIIWhitespace.
//
// When a token is followed by a whitespace byte, advance covers the leading
// whitespace, the token, and that one terminating byte. A trailing run that is
// not yet terminated by whitespace is only returned as a token when atEOF is
// true; otherwise the function consumes just the leading whitespace and
// returns a nil token to request more data. The returned error is always nil.
func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) {
	var i, j int
	// Skip initial whitespace.
	for i = 0; i < len(data); i++ {
		if !isASCIIWhitespace(data[i]) {
			break
		}
	}
	// Look for next whitespace.
	for j = i; j < len(data); j++ {
		if isASCIIWhitespace(data[j]) {
			return j + 1, data[i:j], nil
		}
	}
	// We reached the end of data without finding more whitespace. Only
	// consider it a token if we are at EOF.
	if atEOF && i < j {
		return j, data[i:j], nil
	}
	// Otherwise, request more data.
	return i, nil, nil
}
|
||
|
||
func decodeToWriter(w io.Writer, r io.Reader) (int64, error) {
|
||
tokenizer := html.NewTokenizer(r)
|
||
// Set a memory limit on token sizes, otherwise the tokenizer will
|
||
// buffer text indefinitely if it is not broken up by other token types.
|
||
tokenizer.SetMaxBuf(elementSizeLimit)
|
||
active := false
|
||
total := int64(0)
|
||
for {
|
||
tt := tokenizer.Next()
|
||
switch tt {
|
||
case html.ErrorToken:
|
||
err := tokenizer.Err()
|
||
if err == io.EOF {
|
||
err = nil
|
||
}
|
||
if err == nil && active {
|
||
return total, fmt.Errorf("missing </pre> tag")
|
||
}
|
||
return total, err
|
||
case html.TextToken:
|
||
if active {
|
||
// Re-join the separate chunks of text and
|
||
// feed them to the decoder.
|
||
scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
|
||
scanner.Split(splitASCIIWhitespace)
|
||
for scanner.Scan() {
|
||
n, err := w.Write(scanner.Bytes())
|
||
total += int64(n)
|
||
if err != nil {
|
||
return total, err
|
||
}
|
||
}
|
||
if err := scanner.Err(); err != nil {
|
||
return total, err
|
||
}
|
||
}
|
||
case html.StartTagToken:
|
||
tn, _ := tokenizer.TagName()
|
||
if string(tn) == "pre" {
|
||
if active {
|
||
// nesting not allowed
|
||
return total, fmt.Errorf("unexpected %s", tokenizer.Token())
|
||
}
|
||
active = true
|
||
}
|
||
case html.EndTagToken:
|
||
tn, _ := tokenizer.TagName()
|
||
if string(tn) == "pre" {
|
||
if !active {
|
||
// stray end tag
|
||
return total, fmt.Errorf("unexpected %s", tokenizer.Token())
|
||
}
|
||
active = false
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// NewArmorDecoder returns a new AMP armor decoder.
|
||
func NewArmorDecoder(r io.Reader) (io.Reader, error) {
|
||
pr, pw := io.Pipe()
|
||
go func() {
|
||
_, err := decodeToWriter(pw, r)
|
||
pw.CloseWithError(err)
|
||
}()
|
||
|
||
// The first byte inside the element encoding is a server–client
|
||
// protocol version indicator.
|
||
var version [1]byte
|
||
_, err := pr.Read(version[:])
|
||
if err != nil {
|
||
pr.CloseWithError(err)
|
||
return nil, err
|
||
}
|
||
switch version[0] {
|
||
case '0':
|
||
return base64.NewDecoder(base64.StdEncoding, pr), nil
|
||
default:
|
||
err := ErrUnknownVersion(version[0])
|
||
pr.CloseWithError(err)
|
||
return nil, err
|
||
}
|
||
}
|