mirror of
https://gitlab.torproject.org/tpo/anti-censorship/pluggable-transports/snowflake.git
synced 2025-10-14 14:11:23 -04:00
This package contains a CacheURL function that modifies a URL to be accessed through an AMP cache, and the "AMP armor" data encoding scheme for encoding data into the AMP subset of HTML.
178 lines
7.1 KiB
Go
178 lines
7.1 KiB
Go
package amp
|
|
|
|
import (
|
|
"crypto/sha256"
|
|
"encoding/base32"
|
|
"fmt"
|
|
"net"
|
|
"net/url"
|
|
"path"
|
|
"strings"
|
|
|
|
"golang.org/x/net/idna"
|
|
)
|
|
|
|
// domainPrefixBasic does the basic domain prefix conversion. Does not do any
|
|
// IDNA mapping, such as https://www.unicode.org/reports/tr46/.
|
|
//
|
|
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#basic-algorithm
|
|
func domainPrefixBasic(domain string) (string, error) {
|
|
// 1. Punycode Decode the publisher domain.
|
|
prefix, err := idna.ToUnicode(domain)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
// 2. Replace any "-" (hyphen) character in the output of step 1 with
|
|
// "--" (two hyphens).
|
|
prefix = strings.Replace(prefix, "-", "--", -1)
|
|
|
|
// 3. Replace any "." (dot) character in the output of step 2 with "-"
|
|
// (hyphen).
|
|
prefix = strings.Replace(prefix, ".", "-", -1)
|
|
|
|
// 4. If the output of step 3 has a "-" (hyphen) at both positions 3 and
|
|
// 4, then to the output of step 3, add a prefix of "0-" and add a
|
|
// suffix of "-0".
|
|
if len(prefix) >= 4 && prefix[2] == '-' && prefix[3] == '-' {
|
|
prefix = "0-" + prefix + "-0"
|
|
}
|
|
|
|
// 5. Punycode Encode the output of step 3.
|
|
return idna.ToASCII(prefix)
|
|
}
|
|
|
|
// Lower-case base32 without padding.
|
|
var fallbackBase32Encoding = base32.NewEncoding("abcdefghijklmnopqrstuvwxyz234567").WithPadding(base32.NoPadding)
|
|
|
|
// domainPrefixFallback does the fallback domain prefix conversion. The returned
|
|
// base32 domain uses lower-case letters.
|
|
//
|
|
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#fallback-algorithm
|
|
func domainPrefixFallback(domain string) string {
|
|
// The algorithm specification does not say what, exactly, we are to
|
|
// take the SHA-256 of. domain is notionally an abstract Unicode
|
|
// string, not a byte sequence. While
|
|
// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L62
|
|
// says "Take the SHA256 of the punycode view of the domain," in reality
|
|
// it hashes the UTF-8 encoding of the domain, without Punycode:
|
|
// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/AmpCurlUrlGenerator.js#L141
|
|
// https://github.com/ampproject/amp-toolbox/blob/84cb3057e5f6c54d64369ddd285db1cb36237ee8/packages/cache-url/lib/browser/Sha256.js#L24
|
|
// We do the same here, hashing the raw bytes of domain, presumed to be
|
|
// UTF-8.
|
|
|
|
// 1. Hash the publisher's domain using SHA256.
|
|
h := sha256.Sum256([]byte(domain))
|
|
|
|
// 2. Base32 Escape the output of step 1.
|
|
// 3. Remove the last 4 characters from the output of step 2, which are
|
|
// always "=" (equals) characters.
|
|
return fallbackBase32Encoding.EncodeToString(h[:])
|
|
}
|
|
|
|
// domainPrefix computes the domain prefix of an AMP cache URL.
|
|
//
|
|
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#domain-name-prefix
|
|
func domainPrefix(domain string) string {
|
|
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#combined-algorithm
|
|
// 1. Run the Basic Algorithm. If the output is a valid DNS label,
|
|
// [append the Cache domain suffix and] return. Otherwise continue to
|
|
// step 2.
|
|
prefix, err := domainPrefixBasic(domain)
|
|
// "A domain prefix is not a valid DNS label if it is longer than 63
|
|
// characters"
|
|
if err == nil && len(prefix) <= 63 {
|
|
return prefix
|
|
}
|
|
// 2. Run the Fallback Algorithm. [Append the Cache domain suffix and]
|
|
// return.
|
|
return domainPrefixFallback(domain)
|
|
}
|
|
|
|
// CacheURL computes the AMP cache URL for the publisher URL pubURL, using the
|
|
// AMP cache at cacheURL. contentType is a string such as "c" or "i" that
|
|
// indicates what type of serving the AMP cache is to perform. The Scheme of
|
|
// pubURL must be "http" or "https". The Port of pubURL, if any, must match the
|
|
// default for the scheme. cacheURL may not have RawQuery, Fragment, or
|
|
// RawFragment set, because the resulting URL's query and fragment are taken
|
|
// from the publisher URL.
|
|
//
|
|
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/
|
|
func CacheURL(pubURL, cacheURL *url.URL, contentType string) (*url.URL, error) {
|
|
// The cache URL subdomain, including the domain prefix corresponding to
|
|
// the publisher URL's domain.
|
|
resultHost := domainPrefix(pubURL.Hostname()) + "." + cacheURL.Hostname()
|
|
if cacheURL.Port() != "" {
|
|
resultHost = net.JoinHostPort(resultHost, cacheURL.Port())
|
|
}
|
|
|
|
// https://amp.dev/documentation/guides-and-tutorials/learn/amp-caches-and-cors/amp-cache-urls/#url-path
|
|
// The first part of the path is the cache URL's own path, if any.
|
|
pathComponents := []string{cacheURL.EscapedPath()}
|
|
// The next path component is the content type. We cannot encode an
|
|
// empty content type, because it would result in consecutive path
|
|
// separators, which would semantically combine into a single separator.
|
|
if contentType == "" {
|
|
return nil, fmt.Errorf("invalid content type %+q", contentType)
|
|
}
|
|
pathComponents = append(pathComponents, url.PathEscape(contentType))
|
|
// Then, we add an "s" path component, if the publisher URL scheme is
|
|
// "https".
|
|
switch pubURL.Scheme {
|
|
case "http":
|
|
// Do nothing.
|
|
case "https":
|
|
pathComponents = append(pathComponents, "s")
|
|
default:
|
|
return nil, fmt.Errorf("invalid scheme %+q in publisher URL", pubURL.Scheme)
|
|
}
|
|
// The next path component is the publisher URL's host. The AMP cache
|
|
// URL format specification is not clear about whether other
|
|
// subcomponents of the authority (namely userinfo and port) may appear
|
|
// here. We adopt a policy of forbidding userinfo, and requiring that
|
|
// the port be the default for the scheme (and then we omit the port
|
|
// entirely from the returned URL).
|
|
if pubURL.User != nil {
|
|
return nil, fmt.Errorf("publisher URL may not contain userinfo")
|
|
}
|
|
if port := pubURL.Port(); port != "" {
|
|
if !((pubURL.Scheme == "http" && port == "80") || (pubURL.Scheme == "https" && port == "443")) {
|
|
return nil, fmt.Errorf("publisher URL port %+q is not the default for scheme %+q", port, pubURL.Scheme)
|
|
}
|
|
}
|
|
// As with the content type, we cannot encode an empty host, because
|
|
// that would result in an empty path component.
|
|
if pubURL.Hostname() == "" {
|
|
return nil, fmt.Errorf("invalid host %+q in publisher URL", pubURL.Hostname())
|
|
}
|
|
pathComponents = append(pathComponents, url.PathEscape(pubURL.Hostname()))
|
|
// Finally, we append the remainder of the original escaped path from
|
|
// the publisher URL.
|
|
pathComponents = append(pathComponents, pubURL.EscapedPath())
|
|
|
|
resultRawPath := path.Join(pathComponents...)
|
|
resultPath, err := url.PathUnescape(resultRawPath)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// The query and fragment of the returned URL always come from pubURL.
|
|
// Any query or fragment of cacheURL would be ignored. Return an error
|
|
// if either is set.
|
|
if cacheURL.RawQuery != "" {
|
|
return nil, fmt.Errorf("cache URL may not contain a query")
|
|
}
|
|
if cacheURL.Fragment != "" {
|
|
return nil, fmt.Errorf("cache URL may not contain a fragment")
|
|
}
|
|
|
|
return &url.URL{
|
|
Scheme: cacheURL.Scheme,
|
|
User: cacheURL.User,
|
|
Host: resultHost,
|
|
Path: resultPath,
|
|
RawPath: resultRawPath,
|
|
RawQuery: pubURL.RawQuery,
|
|
Fragment: pubURL.Fragment,
|
|
}, nil
|
|
}
|