amp package.

This package contains a CacheURL function that modifies a URL to be
accessed through an AMP cache, and the "AMP armor" data encoding scheme
for encoding data into the AMP subset of HTML.
This commit is contained in:
David Fifield 2021-07-18 15:22:03 -06:00
parent 0f34a7778f
commit c9e0dd287f
8 changed files with 1223 additions and 0 deletions

136
common/amp/armor_decoder.go Normal file
View file

@ -0,0 +1,136 @@
package amp
import (
"bufio"
"bytes"
"encoding/base64"
"fmt"
"io"
"golang.org/x/net/html"
)
// ErrUnknownVersion is the error returned when the first character inside the
// element encoding (but outside the base64 encoding) is not '0'. The value of
// the error is the unrecognized version indicator byte itself.
type ErrUnknownVersion byte

// Error implements the error interface. The offending byte is rendered as an
// ASCII-only quoted character literal.
func (v ErrUnknownVersion) Error() string {
	indicator := byte(v)
	return fmt.Sprintf("unknown armor version indicator %+q", indicator)
}
// isASCIIWhitespace reports whether b is one of the five ASCII whitespace
// bytes defined by the WHATWG Infra standard: tab, line feed, form feed,
// carriage return, or space. Note that vertical tab (0x0b) is not included.
//
// https://infra.spec.whatwg.org/#ascii-whitespace
func isASCIIWhitespace(b byte) bool {
	return b == '\t' || b == '\n' || b == '\f' || b == '\r' || b == ' '
}
// splitASCIIWhitespace is a bufio.SplitFunc that yields maximal runs of
// non-whitespace bytes, where whitespace is defined by isASCIIWhitespace.
//
// Leading whitespace is skipped. A token is returned as soon as the whitespace
// byte that terminates it has been seen (that byte is consumed too). A
// trailing run that is not followed by whitespace is returned only when atEOF
// is true; otherwise the function consumes just the leading whitespace and
// returns a nil token to ask for more data. The returned error is always nil.
func splitASCIIWhitespace(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Locate the first byte of the token, past any leading whitespace.
	start := 0
	for start < len(data) && isASCIIWhitespace(data[start]) {
		start++
	}

	// Walk forward to the whitespace byte that ends the token.
	end := start
	for ; end < len(data); end++ {
		if isASCIIWhitespace(data[end]) {
			return end + 1, data[start:end], nil
		}
	}

	// No terminating whitespace was found. The remaining bytes only count
	// as a complete token if no more input is coming.
	if atEOF && start < end {
		return end, data[start:end], nil
	}

	// Discard the leading whitespace and wait for more data.
	return start, nil, nil
}
// decodeToWriter tokenizes the HTML read from r, collects the text found
// inside pre elements, removes all ASCII whitespace from it, and writes the
// remaining bytes to w. Text outside of pre elements, comments, and tags other
// than pre are ignored (text inside other elements nested within a pre element
// is still collected).
//
// It returns the number of bytes written to w, along with the first error
// encountered. An error is returned if a pre element is nested inside another,
// if a </pre> end tag appears without a matching start tag, if the input ends
// while a pre element is still open, if the tokenizer fails (including when a
// single token exceeds elementSizeLimit), or if writing to w fails. Reaching
// the end of the input with no pre element open is not an error.
func decodeToWriter(w io.Writer, r io.Reader) (int64, error) {
	tokenizer := html.NewTokenizer(r)
	// Set a memory limit on token sizes, otherwise the tokenizer will
	// buffer text indefinitely if it is not broken up by other token types.
	tokenizer.SetMaxBuf(elementSizeLimit)

	// active is true while we are between <pre> and </pre>.
	active := false
	total := int64(0)
	for {
		tt := tokenizer.Next()
		switch tt {
		case html.ErrorToken:
			err := tokenizer.Err()
			if err == io.EOF {
				// End of input is the normal way to finish.
				err = nil
			}
			if err == nil && active {
				return total, fmt.Errorf("missing </pre> tag")
			}
			return total, err
		case html.TextToken:
			if active {
				// Re-join the separate chunks of text and
				// feed them to the decoder.
				scanner := bufio.NewScanner(bytes.NewReader(tokenizer.Text()))
				scanner.Split(splitASCIIWhitespace)
				for scanner.Scan() {
					n, err := w.Write(scanner.Bytes())
					total += int64(n)
					if err != nil {
						return total, err
					}
				}
				if err := scanner.Err(); err != nil {
					return total, err
				}
			}
		case html.StartTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "pre" {
				if active {
					// Nesting is not allowed. Report the tag using
					// the name that was already extracted above.
					return total, fmt.Errorf("unexpected <%s>", tn)
				}
				active = true
			}
		case html.EndTagToken:
			tn, _ := tokenizer.TagName()
			if string(tn) == "pre" {
				if !active {
					// Stray end tag. Report the tag using the name
					// that was already extracted above.
					return total, fmt.Errorf("unexpected </%s>", tn)
				}
				active = false
			}
		}
	}
}
// NewArmorDecoder returns a new AMP armor decoder. It reads AMP armor (HTML)
// from r and returns an io.Reader that yields the decoded payload.
//
// The HTML is processed by decodeToWriter in a separate goroutine that feeds
// an io.Pipe. NewArmorDecoder blocks until the version indicator byte is
// available. It returns an error if the input ends or fails before any
// armored byte is found, or an ErrUnknownVersion if the version indicator is
// not '0'. Errors that occur later while parsing the HTML are reported by the
// returned reader.
//
// NOTE(review): the goroutine finishes when r is exhausted, when an error
// occurs, or when the pipe is closed on one of the error paths below. A caller
// that stops reading the returned reader early presumably leaves the goroutine
// blocked on the pipe; confirm against callers.
func NewArmorDecoder(r io.Reader) (io.Reader, error) {
	pr, pw := io.Pipe()
	go func() {
		_, err := decodeToWriter(pw, r)
		// A nil err makes the read side of the pipe see io.EOF.
		pw.CloseWithError(err)
	}()

	// The first byte inside the element encoding is a server–client
	// protocol version indicator. io.ReadFull is used rather than a bare
	// Read, because a single Read is permitted to return 0 bytes with a
	// nil error.
	var version [1]byte
	if _, err := io.ReadFull(pr, version[:]); err != nil {
		pr.CloseWithError(err)
		return nil, err
	}
	switch version[0] {
	case '0':
		return base64.NewDecoder(base64.StdEncoding, pr), nil
	default:
		err := ErrUnknownVersion(version[0])
		pr.CloseWithError(err)
		return nil, err
	}
}

176
common/amp/armor_encoder.go Normal file
View file

@ -0,0 +1,176 @@
package amp
import (
"encoding/base64"
"io"
)
// The AMP HTML boilerplate that surrounds the encoded payload. See:
// https://amp.dev/boilerplate/
// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amp-boilerplate/?format=websites
// https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/?format=websites#the-amp-html-format
const (
// boilerplateStart is the document prologue, up to and including the opening
// body tag. NewArmorEncoder writes it before any encoded data.
boilerplateStart = `<!doctype html>
<html amp>
<head>
<meta charset="utf-8">
<script async src="https://cdn.ampproject.org/v0.js"></script>
<link rel="canonical" href="#">
<meta name="viewport" content="width=device-width">
<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
</head>
<body>
`
// boilerplateEnd closes the body and html elements opened by boilerplateStart.
// armorEncoder.Close writes it after the encoded data.
boilerplateEnd = `</body>
</html>`
)
// Size parameters of the AMP armor encoding.
const (
// We restrict the amount of text that may go inside an HTML element, in
// order to limit the amount a decoder may have to buffer. The value is in
// bytes (32 KiB); the decoder also uses it as the tokenizer's buffer limit.
elementSizeLimit = 32 * 1024
// The payload is conceptually a long base64-encoded string, but we
// break the string into short chunks separated by whitespace. This is
// to protect against modification by AMP caches, which reportedly may
// truncate long words in text:
// https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348
bytesPerChunk = 32
// We set the number of chunks per element so as to stay under
// elementSizeLimit. Here, we assume that there is 1 byte of whitespace
// after each chunk (with an additional whitespace byte at the beginning
// of the element). With the values above, the integer division works out
// to 992 chunks per element.
chunksPerElement = (elementSizeLimit - 1) / (bytesPerChunk + 1)
)
// The AMP armor encoder is built as a pipeline. Data first passes through a
// base64 encoder (base64.NewEncoder), whose output is arranged into HTML pre
// elements by an elementEncoder. The top-level armorEncoder ties the two stages
// together and takes care of emitting the AMP boilerplate before and after the
// encoded data. Writing to an armorEncoder writes into the base64 stage, and
// the data flows down the pipeline from there.

// NewArmorEncoder returns a new AMP armor encoder. Data written to the returned
// io.WriteCloser is encoded and written to w. The AMP boilerplate header and
// the version indicator are written to w before NewArmorEncoder returns; if
// either write fails, the error is returned and no encoder is created. The
// caller must call Close to flush any partially written data and to output the
// AMP boilerplate trailer.
func NewArmorEncoder(w io.Writer) (io.WriteCloser, error) {
	// The AMP boilerplate header goes out right away, ahead of any data.
	if _, err := w.Write([]byte(boilerplateStart)); err != nil {
		return nil, err
	}

	elemEnc := &elementEncoder{w: w}
	// The server–client protocol version indicator is written directly
	// to the element encoder, so that it sits outside the base64 layer.
	if _, err := elemEnc.Write([]byte{'0'}); err != nil {
		return nil, err
	}

	return &armorEncoder{
		base64:  base64.NewEncoder(base64.StdEncoding, elemEnc),
		element: elemEnc,
		w:       w,
	}, nil
}
// armorEncoder is the io.WriteCloser returned by NewArmorEncoder. It holds the
// stages of the encoding chain base64 | element | w.
type armorEncoder struct {
	base64  io.WriteCloser  // first stage: base64-encodes what is written
	element *elementEncoder // second stage: wraps the base64 in pre elements
	w       io.Writer       // final destination, also used for the trailer
}

// Write feeds p into the head of the chain base64 | element | w.
func (enc *armorEncoder) Write(p []byte) (int, error) {
	return enc.base64.Write(p)
}

// Close finishes the encoding. The stages are shut down in pipeline order,
// stopping at the first error: the base64 encoder is closed to flush buffered
// data and the final padding, then the element encoder is closed to terminate
// any open element, and finally the AMP boilerplate trailer is written to w.
func (enc *armorEncoder) Close() error {
	if err := enc.base64.Close(); err != nil {
		return err
	}
	if err := enc.element.Close(); err != nil {
		return err
	}
	_, err := enc.w.Write([]byte(boilerplateEnd))
	return err
}
// elementEncoder arranges written data into pre elements, with the text within
// separated into chunks. It does no HTML encoding, so data written must not
// contain any bytes that are meaningful in HTML.
//
// Each chunk holds bytesPerChunk bytes and is followed by a newline. Each pre
// element holds chunksPerElement chunks. Close must be called to terminate an
// element that has been started but not filled.
type elementEncoder struct {
	w              io.Writer
	chunkCounter   int // bytes written so far in the current chunk
	elementCounter int // complete chunks written so far in the current element
}

// Write writes p to the underlying writer, inserting "<pre>" start tags,
// chunk-separating newlines, and "</pre>" end tags as needed.
//
// In keeping with the io.Writer contract, the returned count is the number of
// bytes of p that were written; the markup and separators that the encoder
// inserts are not counted.
func (enc *elementEncoder) Write(p []byte) (n int, err error) {
	for len(p) > 0 {
		// Open a new element if we are not currently inside one.
		if enc.elementCounter == 0 && enc.chunkCounter == 0 {
			if _, err = enc.w.Write([]byte("<pre>\n")); err != nil {
				return n, err
			}
		}

		// Write as much of p as fits in the remainder of the current
		// chunk.
		size := bytesPerChunk - enc.chunkCounter
		if size > len(p) {
			size = len(p)
		}
		var nn int
		nn, err = enc.w.Write(p[:size])
		n += nn
		if err != nil {
			return n, err
		}
		p = p[size:]

		// Terminate the chunk once it is full.
		enc.chunkCounter += size
		if enc.chunkCounter >= bytesPerChunk {
			enc.chunkCounter = 0
			enc.elementCounter++
			if _, err = enc.w.Write([]byte("\n")); err != nil {
				return n, err
			}
		}

		// Terminate the element once it holds enough chunks.
		if enc.elementCounter >= chunksPerElement {
			enc.elementCounter = 0
			if _, err = enc.w.Write([]byte("</pre>\n")); err != nil {
				return n, err
			}
		}
	}
	return n, nil
}

// Close terminates the currently open pre element, if there is one. A partial
// chunk is first terminated with a newline. Close does not close the
// underlying writer.
func (enc *elementEncoder) Close() error {
	if enc.elementCounter == 0 && enc.chunkCounter == 0 {
		// No element is open.
		return nil
	}
	trailer := "</pre>\n"
	if enc.chunkCounter != 0 {
		trailer = "\n" + trailer
	}
	_, err := enc.w.Write([]byte(trailer))
	return err
}

227
common/amp/armor_test.go Normal file
View file

@ -0,0 +1,227 @@
package amp
import (
"crypto/rand"
"io"
"io/ioutil"
"strings"
"testing"
)
// armorDecodeToString runs the AMP armor decoder over src and returns all of
// the decoded output as a string. An error from creating the decoder yields an
// empty string; an error while reading is returned together with whatever was
// decoded before it occurred.
func armorDecodeToString(src string) (string, error) {
	decoder, err := NewArmorDecoder(strings.NewReader(src))
	if err != nil {
		return "", err
	}
	decoded, err := ioutil.ReadAll(decoder)
	return string(decoded), err
}
// TestArmorDecoder feeds hand-written HTML documents to the armor decoder and
// checks either that decoding fails, or that it succeeds with the expected
// output.
func TestArmorDecoder(t *testing.T) {
	cases := []struct {
		input          string
		expectedOutput string
		expectedErr    bool
	}{
		{`
<pre>
0
</pre>
`,
			"",
			false,
		},
		{`
<pre>
0aGVsbG8gd29ybGQK
</pre>
`,
			"hello world\n",
			false,
		},
		// bad version indicator
		{`
<pre>
1aGVsbG8gd29ybGQK
</pre>
`,
			"",
			true,
		},
		// text outside <pre> elements
		{`
0aGVsbG8gd29ybGQK
blah blah blah
<pre>
0aGVsbG8gd29ybGQK
</pre>
0aGVsbG8gd29ybGQK
blah blah blah
`,
			"hello world\n",
			false,
		},
		{`
<pre>
0QUJDREV
GR0hJSkt
MTU5PUFF
SU1RVVld
</pre>
junk
<pre>
YWVowMTI
zNDU2Nzg
5Cg
=
</pre>
<pre>
=
</pre>
`,
			"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\n",
			false,
		},
		// no <pre> elements, hence no version indicator
		{`
aGVsbG8gd29ybGQK
blah blah blah
aGVsbG8gd29ybGQK
aGVsbG8gd29ybGQK
blah blah blah
`,
			"",
			true,
		},
		// empty <pre> elements, hence no version indicator
		{`
aGVsbG8gd29ybGQK
blah blah blah
<pre> </pre>
aGVsbG8gd29ybGQK
aGVsbG8gd29ybGQK<pre></pre>
blah blah blah
`,
			"",
			true,
		},
		// other elements inside <pre>
		{
			"blah <pre>0aGVsb<p>G8gd29</p>ybGQK</pre>",
			"hello world\n",
			false,
		},
		// HTML comment
		{
			"blah <!-- <pre>aGVsbG8gd29ybGQK</pre> -->",
			"",
			true,
		},
		// all kinds of ASCII whitespace
		{
			"blah <pre>\x200\x09aG\x0aV\x0csb\x0dG8\x20gd29ybGQK</pre>",
			"hello world\n",
			false,
		},
		// bad padding
		{`
<pre>
0QUJDREV
GR0hJSkt
MTU5PUFF
SU1RVVld
</pre>
junk
<pre>
YWVowMTI
zNDU2Nzg
5Cg
=
</pre>
`,
			"",
			true,
		},
		/*
			// per-chunk base64
			// test disabled because Go stdlib handles this incorrectly:
			// https://github.com/golang/go/issues/31626
			{
				"<pre>QQ==</pre><pre>Qg==</pre>",
				"",
				true,
			},
		*/
		// missing </pre>
		{
			"blah <pre></pre><pre>0aGVsbG8gd29ybGQK",
			"",
			true,
		},
		// nested <pre>
		{
			"blah <pre>0aGVsb<pre>G8gd29</pre>ybGQK</pre>",
			"",
			true,
		},
	}

	for _, tc := range cases {
		output, err := armorDecodeToString(tc.input)
		switch {
		case tc.expectedErr && err == nil:
			t.Errorf("%+q → (%+q, %v), expected error", tc.input, output, err)
		case !tc.expectedErr && err != nil:
			t.Errorf("%+q → (%+q, %v), expected no error", tc.input, output, err)
		case !tc.expectedErr && output != tc.expectedOutput:
			t.Errorf("%+q → (%+q, %v), expected (%+q, %v)",
				tc.input, output, err, tc.expectedOutput, nil)
		}
	}
}
// armorRoundTrip encodes s with the AMP armor encoder, decodes the resulting
// document with the AMP armor decoder, and returns what comes out. The first
// error from either direction is returned.
func armorRoundTrip(s string) (string, error) {
	var armored strings.Builder
	enc, err := NewArmorEncoder(&armored)
	if err != nil {
		return "", err
	}
	if _, err := io.Copy(enc, strings.NewReader(s)); err != nil {
		return "", err
	}
	// Close flushes buffered data and writes the trailer; the document is
	// not complete without it.
	if err := enc.Close(); err != nil {
		return "", err
	}
	return armorDecodeToString(armored.String())
}
// TestArmorRoundTrip checks that random payloads of various lengths survive
// being encoded and then decoded. The lengths cover short strings, as well as
// values on either side of one and two times elementSizeLimit.
func TestArmorRoundTrip(t *testing.T) {
	var lengths []int
	// Test short strings and lengths around elementSizeLimit thresholds.
	for i := 0; i < bytesPerChunk*2; i++ {
		lengths = append(lengths, i)
	}
	for i := -10; i < +10; i++ {
		lengths = append(lengths, elementSizeLimit+i)
		lengths = append(lengths, 2*elementSizeLimit+i)
	}
	for _, n := range lengths {
		buf := make([]byte, n)
		// Without random input the round trip would be tested on
		// all-zero data, so a failure here is fatal.
		if _, err := rand.Read(buf); err != nil {
			t.Fatalf("length %d: reading random bytes: %v", n, err)
		}
		input := string(buf)
		output, err := armorRoundTrip(input)
		if err != nil {
			t.Errorf("length %d → error %v", n, err)
			continue
		}
		if output != input {
			t.Errorf("length %d → %+q", n, output)
			continue
		}
	}
}

178
common/amp/cache.go Normal file
View file

@ -0,0 +1,178 @@
package amp
import (
"crypto/sha256"
"encoding/base32"
"fmt"
"net"
"net/url"
"path"
"strings"
"golang.org/x/net/idna"
)
// domainPrefixBasic performs the basic domain prefix conversion of the AMP
// cache URL format. It does not apply any IDNA mapping, such as
// https://www.unicode.org/reports/tr46/. An error is returned if the Punycode
// decoding or encoding of the domain fails.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
func domainPrefixBasic(domain string) (string, error) {
	// 1. Punycode Decode the publisher domain.
	decoded, err := idna.ToUnicode(domain)
	if err != nil {
		return "", err
	}

	// 2. Replace any "-" (hyphen) character in the output of step 1 with
	//    "--" (two hyphens).
	escaped := strings.ReplaceAll(decoded, "-", "--")

	// 3. Replace any "." (dot) character in the output of step 2 with "-"
	//    (hyphen).
	escaped = strings.ReplaceAll(escaped, ".", "-")

	// 4. If the output of step 3 has a "-" (hyphen) at both positions 3 and
	//    4, then to the output of step 3, add a prefix of "0-" and add a
	//    suffix of "-0".
	// NOTE(review): the positions are byte offsets here, which differ from
	// character positions when the string begins with non-ASCII
	// characters; confirm which interpretation AMP caches use.
	if len(escaped) >= 4 && escaped[2:4] == "--" {
		escaped = "0-" + escaped + "-0"
	}

	// 5. Punycode Encode the output of step 3 (as possibly modified by
	//    step 4).
	return idna.ToASCII(escaped)
}
// fallbackBase32Encoding is base32 with a lower-case alphabet and no padding.
var fallbackBase32Encoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding)

// domainPrefixFallback performs the fallback domain prefix conversion of the
// AMP cache URL format: the unpadded, lower-case base32 encoding of the SHA-256
// hash of domain.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#fallback-algorithm
func domainPrefixFallback(domain string) string {
	// The specification leaves open exactly which bytes are to be hashed,
	// given that domain is notionally an abstract Unicode string. The
	// comment at
	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L62
	// says "Take the SHA256 of the punycode view of the domain," but the
	// code actually hashes the UTF-8 encoding of the domain, with no
	// Punycode involved:
	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L141
	// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/browser/Sha256.js#L24
	// We follow the code, and hash the raw bytes of domain, presumed to be
	// UTF-8.

	// 1. Hash the publisher's domain using SHA256.
	hasher := sha256.New()
	hasher.Write([]byte(domain))
	digest := hasher.Sum(nil)

	// 2. Base32 Escape the output of step 1.
	// 3. Remove the last 4 characters from the output of step 2, which are
	//    always "=" (equals) characters. (The encoding is configured to
	//    produce no padding in the first place.)
	return fallbackBase32Encoding.EncodeToString(digest)
}
// domainPrefix computes the domain prefix of an AMP cache URL, using the
// combined algorithm: the result of the basic algorithm when it is usable, and
// the result of the fallback algorithm otherwise.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#domain-name-prefix
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#combined-algorithm
func domainPrefix(domain string) string {
	// 1. Run the Basic Algorithm. If the output is a valid DNS label,
	//    [append the Cache domain suffix and] return. Otherwise continue to
	//    step 2.
	//
	// "A domain prefix is not a valid DNS label if it is longer than 63
	// characters"
	if prefix, err := domainPrefixBasic(domain); err == nil && len(prefix) <= 63 {
		return prefix
	}

	// 2. Run the Fallback Algorithm. [Append the Cache domain suffix and]
	//    return.
	return domainPrefixFallback(domain)
}
// CacheURL computes the AMP cache URL for the publisher URL pubURL, using the
// AMP cache at cacheURL. contentType is a string such as "c" or "i" that
// indicates what type of serving the AMP cache is to perform; it must not be
// empty.
//
// The Scheme of pubURL must be "http" or "https", its host must not be empty,
// it must not contain userinfo, and its Port, if any, must be the default for
// the scheme. cacheURL may not have RawQuery or Fragment set, because the
// resulting URL's query and fragment are taken from the publisher URL. An
// error is returned when any of these requirements is not met.
//
// The returned URL takes its scheme, userinfo, and port from cacheURL. Its
// host is the domain prefix of the publisher host followed by the cache host.
// Its path is the cache path, the content type, an "s" component for https
// publisher URLs, the publisher host, and the publisher path, in that order.
//
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/
func CacheURL(pubURL, cacheURL *url.URL, contentType string) (*url.URL, error) {
	pubHost := pubURL.Hostname()

	// The host of the result is a subdomain of the cache host, named by
	// the domain prefix that corresponds to the publisher host.
	host := domainPrefix(pubHost) + "." + cacheURL.Hostname()
	if cachePort := cacheURL.Port(); cachePort != "" {
		host = net.JoinHostPort(host, cachePort)
	}

	// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-path
	// The path begins with the cache URL's own path, if any, followed by
	// the content type. An empty content type cannot be represented: it
	// would produce two consecutive path separators, which are
	// semantically the same as one.
	if contentType == "" {
		return nil, fmt.Errorf("invalid content type %+q", contentType)
	}
	segments := []string{cacheURL.EscapedPath(), url.PathEscape(contentType)}

	// An "s" path component marks an "https" publisher URL. While looking
	// at the scheme, also note its default port for the check below.
	var defaultPort string
	switch pubURL.Scheme {
	case "http":
		defaultPort = "80"
	case "https":
		defaultPort = "443"
		segments = append(segments, "s")
	default:
		return nil, fmt.Errorf("invalid scheme %+q in publisher URL", pubURL.Scheme)
	}

	// Next comes the publisher URL's host. The AMP cache URL format
	// specification does not make clear whether the other parts of the
	// authority (userinfo and port) are allowed here. Our policy is to
	// forbid userinfo, and to require the port to be the default for the
	// scheme, in which case it is left out of the result.
	if pubURL.User != nil {
		return nil, fmt.Errorf("publisher URL may not contain userinfo")
	}
	if port := pubURL.Port(); port != "" && port != defaultPort {
		return nil, fmt.Errorf("publisher URL port %+q is not the default for scheme %+q", port, pubURL.Scheme)
	}
	// Like an empty content type, an empty host cannot be represented as a
	// path component.
	if pubHost == "" {
		return nil, fmt.Errorf("invalid host %+q in publisher URL", pubHost)
	}

	// The path ends with the publisher host and the publisher URL's own
	// escaped path.
	segments = append(segments, url.PathEscape(pubHost), pubURL.EscapedPath())
	rawPath := path.Join(segments...)
	unescapedPath, err := url.PathUnescape(rawPath)
	if err != nil {
		return nil, err
	}

	// The query and fragment of the result always come from pubURL. Rather
	// than silently dropping a query or fragment in cacheURL, treat it as
	// an error.
	if cacheURL.RawQuery != "" {
		return nil, fmt.Errorf("cache URL may not contain a query")
	}
	if cacheURL.Fragment != "" {
		return nil, fmt.Errorf("cache URL may not contain a fragment")
	}

	return &url.URL{
		Scheme:   cacheURL.Scheme,
		User:     cacheURL.User,
		Host:     host,
		Path:     unescapedPath,
		RawPath:  rawPath,
		RawQuery: pubURL.RawQuery,
		Fragment: pubURL.Fragment,
	}, nil
}

320
common/amp/cache_test.go Normal file
View file

@ -0,0 +1,320 @@
package amp
import (
"bytes"
"net/url"
"testing"
"golang.org/x/net/idna"
)
// TestDomainPrefixBasic checks the output of domainPrefixBasic against known
// values, and checks that invalid Punycode input results in an error.
func TestDomainPrefixBasic(t *testing.T) {
	// Tests expecting no error.
	okCases := []struct {
		domain, expected string
	}{
		{"", ""},
		{"xn--", ""},
		{"...", "---"},

		// Should not apply mappings such as case folding and
		// normalization.
		{"b\u00fccher.de", "xn--bcher-de-65a"},
		{"B\u00fccher.de", "xn--Bcher-de-65a"},
		{"bu\u0308cher.de", "xn--bucher-de-hkf"},

		// Check some that differ between IDNA 2003 and IDNA 2008.
		// https://unicode.org/reports/tr46/#Deviations
		// https://util.unicode.org/UnicodeJsps/idna.jsp
		{"faß.de", "xn--fa-de-mqa"},
		{"βόλοσ.com", "xn---com-4ld8c2a6a8e"},

		// Lengths of 63 and 64. 64 is too long for a DNS label, but
		// domainPrefixBasic is not expected to check for that.
		{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},
		{"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"},

		// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
		{"example.com", "example-com"},
		{"foo.example.com", "foo-example-com"},
		{"foo-example.com", "foo--example-com"},
		{"xn--57hw060o.com", "xn---com-p33b41770a"},
		{"\u26a1\U0001f60a.com", "xn---com-p33b41770a"},
		{"en-us.example.com", "0-en--us-example-com-0"},
	}
	for _, tc := range okCases {
		output, err := domainPrefixBasic(tc.domain)
		if err == nil && output == tc.expected {
			continue
		}
		t.Errorf("%+q → (%+q, %v), expected (%+q, %v)",
			tc.domain, output, err, tc.expected, nil)
	}

	// Tests expecting an error.
	errCases := []string{
		"xn---",
	}
	for _, domain := range errCases {
		output, err := domainPrefixBasic(domain)
		if err != nil && output == "" {
			continue
		}
		t.Errorf("%+q → (%+q, %v), expected (%+q, non-nil)",
			domain, output, err, "")
	}
}
// TestDomainPrefixFallback checks the output of domainPrefixFallback against
// known values.
func TestDomainPrefixFallback(t *testing.T) {
	cases := []struct {
		domain, expected string
	}{
		{
			"",
			"4oymiquy7qobjgx36tejs35zeqt24qpemsnzgtfeswmrw6csxbkq",
		},
		{
			"example.com",
			"un42n5xov642kxrxrqiyanhcoupgql5lt4wtbkyt2ijflbwodfdq",
		},

		// These checked against the output of
		// https://github.com/ampproject/amp-toolbox/tree/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url,
		// using the widget at
		// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-format.
		{
			"000000000000000000000000000000000000000000000000000000000000.com",
			"stejanx4hsijaoj4secyecy4nvqodk56kw72whwcmvdbtucibf5a",
		},
		{
			"00000000000000000000000000000000000000000000000000000000000a.com",
			"jdcvbsorpnc3hcjrhst56nfm6ymdpovlawdbm2efyxpvlt4cpbya",
		},
		{
			"00000000000000000000000000000000000000000000000000000000000\u03bb.com",
			"qhzqeumjkfpcpuic3vqruyjswcr7y7gcm3crqyhhywvn3xrhchfa",
		},
	}
	for _, tc := range cases {
		if output := domainPrefixFallback(tc.domain); output != tc.expected {
			t.Errorf("%+q → %+q, expected %+q",
				tc.domain, output, tc.expected)
		}
	}
}
// TestDomainPrefix checks that domainPrefix picks domainPrefixBasic or
// domainPrefixFallback as appropriate. It does so by verifying that the
// result is always a valid DNS label (in particular, free of dots) that is
// IDNA-decodable.
func TestDomainPrefix(t *testing.T) {
	// A validating IDNA profile, which checks label length and that the
	// label contains only certain ASCII characters. It does not do the
	// ValidateLabels check, because that depends on the input having
	// certain properties.
	profile := idna.New(
		idna.VerifyDNSLength(true),
		idna.StrictDomainName(true),
	)

	domains := []string{
		"example.com",
		"\u0314example.com",
		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",  // 63 bytes
		"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", // 64 bytes
		"xn--57hw060o.com",
		"a b c",
	}
	for _, domain := range domains {
		output := domainPrefix(domain)
		if dot := bytes.IndexByte([]byte(output), '.'); dot != -1 {
			t.Errorf("%+q → %+q contains a dot", domain, output)
		}
		if _, err := profile.ToUnicode(output); err != nil {
			t.Errorf("%+q → error %v", domain, err)
		}
	}
}
// mustParseURL parses rawurl with url.Parse and returns the result. It panics
// if rawurl cannot be parsed, so it is only suitable for inputs that are known
// to be valid, such as the literals in test tables.
func mustParseURL(rawurl string) *url.URL {
	parsed, err := url.Parse(rawurl)
	if err == nil {
		return parsed
	}
	panic(err)
}
func TestCacheURL(t *testing.T) {
// Tests expecting no error.
for _, test := range []struct {
pub string
cache string
contentType string
expected string
}{
// With or without trailing slash on pubURL.
{
"http://example.com/",
"https://amp.cache/",
"c",
"https://example-com.amp.cache/c/example.com",
},
{
"http://example.com",
"https://amp.cache/",
"c",
"https://example-com.amp.cache/c/example.com",
},
// https pubURL.
{
"https://example.com/",
"https://amp.cache/",
"c",
"https://example-com.amp.cache/c/s/example.com",
},
// The content type should be escaped if necessary.
{
"http://example.com/",
"https://amp.cache/",
"/",
"https://example-com.amp.cache/%2F/example.com",
},
// Retain pubURL path, query, and fragment, including escaping.
{
"http://example.com/my%2Fpath/index.html?a=1#fragment",
"https://amp.cache/",
"c",
"https://example-com.amp.cache/c/example.com/my%2Fpath/index.html?a=1#fragment",
},
// Retain scheme, userinfo, port, and path of cacheURL, escaping
// whatever is necessary.
{
"http://example.com",
"http://cache%2Fuser:cache%40pass@amp.cache:123/with/../../path/..%2f../",
"c",
"http://cache%2Fuser:cache%40pass@example-com.amp.cache:123/path/..%2f../c/example.com",
},
// Port numbers in pubURL are allowed, if they're the default
// for scheme.
{
"http://example.com:80/",
"https://amp.cache/",
"c",
"https://example-com.amp.cache/c/example.com",
},
{
"https://example.com:443/",
"https://amp.cache/",
"c",
"https://example-com.amp.cache/c/s/example.com",
},
// "?" at the end of cacheURL is okay, as long as the query is
// empty.
{
"http://example.com/",
"https://amp.cache/?",
"c",
"https://example-com.amp.cache/c/example.com",
},
// https://developers.google.com/amp/cache/overview#example-requesting-document-using-tls
{
"https://example.com/amp_document.html",
"https://cdn.ampproject.org/",
"c",
"https://example-com.cdn.ampproject.org/c/s/example.com/amp_document.html",
},
// https://developers.google.com/amp/cache/overview#example-requesting-image-using-plain-http
{
"http://example.com/logo.png",
"https://cdn.ampproject.org/",
"i",
"https://example-com.cdn.ampproject.org/i/example.com/logo.png",
},
// https://developers.google.com/amp/cache/overview#query-parameter-example
{
"https://example.com/g?value=Hello%20World",
"https://cdn.ampproject.org/",
"c",
"https://example-com.cdn.ampproject.org/c/s/example.com/g?value=Hello%20World",
},
} {
pubURL := mustParseURL(test.pub)
cacheURL := mustParseURL(test.cache)
outputURL, err := CacheURL(pubURL, cacheURL, test.contentType)
if err != nil {
t.Errorf("%+q %+q %+q → error %v",
test.pub, test.cache, test.contentType, err)
continue
}
if outputURL.String() != test.expected {
t.Errorf("%+q %+q %+q → %+q, expected %+q",
test.pub, test.cache, test.contentType, outputURL, test.expected)
continue
}
}
// Tests expecting an error.
for _, test := range []struct {
pub string
cache string
contentType string
}{
// Empty content type.
{
"http://example.com/",
"https://amp.cache/",
"",
},
// Empty host.
{
"http:///index.html",
"https://amp.cache/",
"c",
},
// Empty scheme.
{
"//example.com/",
"https://amp.cache/",
"c",
},
// Unrecognized scheme.
{
"ftp://example.com/",
"https://amp.cache/",
"c",
},
// Wrong port number for scheme.
{
"http://example.com:443/",
"https://amp.cache/",
"c",
},
// userinfo in pubURL.
{
"http://user@example.com/",
"https://amp.cache/",
"c",
},
{
"http://user:pass@example.com/",
"https://amp.cache/",
"c",
},
// cacheURL may not contain a query.
{
"http://example.com/",
"https://amp.cache/?a=1",
"c",
},
// cacheURL may not contain a fragment.
{
"http://example.com/",
"https://amp.cache/#fragment",
"c",
},
} {
pubURL := mustParseURL(test.pub)
cacheURL := mustParseURL(test.cache)
outputURL, err := CacheURL(pubURL, cacheURL, test.contentType)
if err == nil {
t.Errorf("%+q %+q %+q → %+q, expected error",
test.pub, test.cache, test.contentType, outputURL)
continue
}
}
}

88
common/amp/doc.go Normal file
View file

@ -0,0 +1,88 @@
/*
Package amp provides functions for working with the AMP (Accelerated Mobile
Pages) subset of HTML, and conveying binary data through an AMP cache.
AMP cache
The CacheURL function takes a plain URL and converts it to be accessed through a
given AMP cache.
The EncodePath and DecodePath functions provide a way to encode data into the
suffix of a URL path. AMP caches do not support HTTP POST, but encoding data
into a URL path with GET is an alternative means of sending data to the server.
The format of an encoded path is:
0<0 or more bytes, including slash>/<base64 of data>
That is:
* "0", a format version number, which controls the interpretation of the rest of
the path. Only the first byte matters as a version indicator (not the whole
first path component).
* Any number of slash or non-slash bytes. These may be used as padding or to
prevent cache collisions in the AMP cache.
* A final slash.
* base64 encoding of the data, using the URL-safe alphabet (which does not
include slash).
For example, an encoding of the string "This is path-encoded data." is the
following. The "lgWHcwhXFjUm" following the format version number is random
padding that will be ignored on decoding.
0lgWHcwhXFjUm/VGhpcyBpcyBwYXRoLWVuY29kZWQgZGF0YS4
It is the caller's responsibility to add or remove any directory path prefix
before calling EncodePath or DecodePath.
AMP armor
AMP armor is a data encoding scheme that satisfies the requirements of the
AMP (Accelerated Mobile Pages) subset of HTML, and survives modification by an
AMP cache. For the requirements of AMP HTML, see
https://amp.dev/documentation/guides-and-tutorials/learn/spec/amphtml/.
For modifications that may be made by an AMP cache, see
https://github.com/ampproject/amphtml/blob/main/docs/spec/amp-cache-modifications.md.
The encoding is based on ones created by Ivan Markin. See codec/amp/ in
https://github.com/nogoegst/amper and discussion at
https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985.
The encoding algorithm works as follows. Base64-encode the input. Prepend the
base64 with the byte '0'; this is a protocol version indicator that the decoder
can use to determine how to interpret the bytes that follow. Split the result
into fixed-size chunks separated by whitespace. Take up to 992 chunks at a
time, and wrap them in a pre element. Then, situate the markup so far within the
body of the AMP HTML boilerplate. The decoding algorithm is to scan the HTML for
pre elements, split their text contents on whitespace and concatenate, then
base64 decode. The base64 encoding uses the standard alphabet, with normal "="
padding (https://tools.ietf.org/html/rfc4648#section-4).
The reason for splitting the base64 into chunks is that AMP caches reportedly
truncate long strings that are not broken by whitespace:
https://bugs.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake/25985#note_2592348.
The characters that may separate the chunks are the ASCII whitespace characters
(https://infra.spec.whatwg.org/#ascii-whitespace) "\x09", "\x0a", "\x0c",
"\x0d", and "\x20". The reason for separating the chunks into pre elements is to
limit the amount of text a decoder may have to buffer while parsing the HTML.
Each pre element may contain at most 32 KB of text. pre elements may not be
nested.
Example
The following is the result of encoding the string
"This was encoded with AMP armor.":
<!doctype html>
<html amp>
<head>
<meta charset="utf-8">
<script async src="https://cdn.ampproject.org/v0.js"></script>
<link rel="canonical" href="#">
<meta name="viewport" content="width=device-width">
<style amp-boilerplate>body{-webkit-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-moz-animation:-amp-start 8s steps(1,end) 0s 1 normal both;-ms-animation:-amp-start 8s steps(1,end) 0s 1 normal both;animation:-amp-start 8s steps(1,end) 0s 1 normal both}@-webkit-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-moz-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-ms-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@-o-keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}@keyframes -amp-start{from{visibility:hidden}to{visibility:visible}}</style><noscript><style amp-boilerplate>body{-webkit-animation:none;-moz-animation:none;-ms-animation:none;animation:none}</style></noscript>
</head>
<body>
<pre>
0VGhpcyB3YXMgZW5jb2RlZCB3aXRoIEF
NUCBhcm1vci4=
</pre>
</body>
</html>
*/
package amp

44
common/amp/path.go Normal file
View file

@ -0,0 +1,44 @@
package amp
import (
"crypto/rand"
"encoding/base64"
"fmt"
"strings"
)
// EncodePath encodes data in a way that is suitable for the suffix of an AMP
// cache URL. The result consists of the format version indicator "0", the
// base64 encoding of 9 random bytes, a slash, and the base64 encoding of data.
// Both base64 encodings use the URL-safe alphabet without padding. The random
// bytes carry no information and are skipped by DecodePath.
//
// EncodePath panics if random bytes cannot be read from crypto/rand.
func EncodePath(data []byte) string {
	cacheBreaker := make([]byte, 9)
	if _, err := rand.Read(cacheBreaker); err != nil {
		panic(err)
	}
	enc := base64.RawURLEncoding
	return "0" + enc.EncodeToString(cacheBreaker) + "/" + enc.EncodeToString(data)
}
// DecodePath decodes data from a path suffix as encoded by EncodePath. The path
// must have already been trimmed of any directory prefix (as might be present
// in, e.g., an HTTP request). That is, the first character of path should be
// the "0" message format indicator.
//
// An error is returned if path is empty, if its first byte is not a known
// format indicator, if it contains no slash after the format indicator, or if
// what follows the final slash is not valid unpadded URL-safe base64.
func DecodePath(path string) ([]byte, error) {
	if path == "" {
		return nil, fmt.Errorf("missing format indicator")
	}
	if version := path[0]; version != '0' {
		return nil, fmt.Errorf("unknown format indicator %q", version)
	}

	// Format "0": ignore everything up to and including the final slash
	// (there must be at least one slash). The format indicator at index 0
	// is not a slash, so searching the whole of path is the same as
	// searching only what follows the indicator.
	slash := strings.LastIndexByte(path, '/')
	if slash == -1 {
		return nil, fmt.Errorf("missing data")
	}
	return base64.RawURLEncoding.DecodeString(path[slash+1:])
}

54
common/amp/path_test.go Normal file
View file

@ -0,0 +1,54 @@
package amp
import (
"testing"
)
// TestDecodePath checks DecodePath against paths with known results: either
// the exact error message that is expected, or the expected decoded data.
func TestDecodePath(t *testing.T) {
	for _, test := range []struct {
		path           string
		expectedData   string
		expectedErrStr string
	}{
		{"", "", "missing format indicator"},
		{"0", "", "missing data"},
		{"0foobar", "", "missing data"},
		{"/0/YWJj", "", "unknown format indicator '/'"},

		{"0/", "", ""},
		{"0foobar/", "", ""},
		{"0/YWJj", "abc", ""},
		{"0///YWJj", "abc", ""},
		{"0foobar/YWJj", "abc", ""},
		{"0/foobar/YWJj", "abc", ""},
	} {
		data, err := DecodePath(test.path)
		if test.expectedErrStr != "" {
			if err == nil || err.Error() != test.expectedErrStr {
				// err may be nil here, so it is formatted with
				// %v rather than with a quoting verb.
				t.Errorf("%+q expected error %+q, got %v",
					test.path, test.expectedErrStr, err)
			}
		} else if err != nil {
			t.Errorf("%+q expected no error, got %+q", test.path, err)
		} else if string(data) != test.expectedData {
			t.Errorf("%+q expected data %+q, got %+q",
				test.path, test.expectedData, data)
		}
	}
}
func TestPathRoundTrip(t *testing.T) {
for _, data := range []string{
"",
"\x00",
"/",
"hello world",
} {
decoded, err := DecodePath(EncodePath([]byte(data)))
if err != nil {
t.Errorf("%+q roundtripped with error %v", data, err)
} else if string(decoded) != data {
t.Errorf("%+q roundtripped to %+q", data, decoded)
}
}
}