diff --git a/aklapi.go b/aklapi.go
index 8e1f0fb..327e3c2 100644
--- a/aklapi.go
+++ b/aklapi.go
@@ -10,9 +10,16 @@ var (
 )
 
 // userAgent is sent with all outgoing HTTP requests. The Auckland Council
-// website CDN (Fastly) returns 406 for requests that identify as Go's default
-// http client, so we send a browser-compatible value instead.
-const userAgent = "Mozilla/5.0 (compatible; aklapi/1.0)"
+// collection page returns 406 unless requests resemble a modern browser.
+const userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"
+
+const (
+	defaultAccept         = "application/json, text/html, */*"
+	browserAccept         = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+	browserAcceptLanguage = "en-NZ,en;q=0.9,en-US;q=0.8"
+	secCHUA               = "\"Chromium\";v=\"135\", \"Not.A/Brand\";v=\"8\""
+	secCHUAPlatform       = "\"Windows\""
+)
 
 // browserTransport is an http.RoundTripper that adds browser-like headers to
 // every request before forwarding it to the underlying transport.
@@ -27,7 +34,30 @@ func (t *browserTransport) RoundTrip(req *http.Request) (*http.Response, error)
 		r.Header.Set("User-Agent", userAgent)
 	}
 	if r.Header.Get("Accept") == "" {
-		r.Header.Set("Accept", "application/json, text/html, */*")
+		r.Header.Set("Accept", defaultAccept)
 	}
-	return t.wrapped.RoundTrip(r)
+	if r.Header.Get("Accept-Language") == "" {
+		r.Header.Set("Accept-Language", browserAcceptLanguage)
+	}
+
+	wrapped := t.wrapped
+	if wrapped == nil {
+		wrapped = http.DefaultTransport
+	}
+	return wrapped.RoundTrip(r)
+}
+
+// setBrowserDocumentHeaders applies the navigation-style headers required by
+// the collection HTML page.
+func setBrowserDocumentHeaders(r *http.Request) { + r.Header.Set("Accept", browserAccept) + r.Header.Set("Cache-Control", "max-age=0") + r.Header.Set("Upgrade-Insecure-Requests", "1") + r.Header.Set("Sec-Fetch-Site", "none") + r.Header.Set("Sec-Fetch-Mode", "navigate") + r.Header.Set("Sec-Fetch-User", "?1") + r.Header.Set("Sec-Fetch-Dest", "document") + r.Header.Set("Sec-CH-UA", secCHUA) + r.Header.Set("Sec-CH-UA-Mobile", "?0") + r.Header.Set("Sec-CH-UA-Platform", secCHUAPlatform) } diff --git a/rubbish.go b/rubbish.go index 6722a2c..6164307 100644 --- a/rubbish.go +++ b/rubbish.go @@ -120,12 +120,14 @@ func fetchandparse(ctx context.Context, addressID string) (*CollectionDayDetailR if err != nil { return nil, err } + setBrowserDocumentHeaders(req) resp, err := collectionHTTPClient.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { + slog.WarnContext(ctx, "collection request failed", "status", resp.StatusCode, "url", req.URL.String()) return nil, fmt.Errorf("collection API returned status code: %d", resp.StatusCode) } return parse(resp.Body) diff --git a/rubbish_test.go b/rubbish_test.go index 97d2441..4967cb9 100644 --- a/rubbish_test.go +++ b/rubbish_test.go @@ -186,6 +186,149 @@ func TestFetchAndParse_StatusCodeError(t *testing.T) { } } +func TestCollectionRequestHeaders(t *testing.T) { + t.Cleanup(func() { + NoCache = false + }) + NoCache = true + + t.Run("fetchandparse uses browser headers", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if got := r.Header.Get("User-Agent"); got != userAgent { + http.Error(w, "missing browser user-agent", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Accept"); got != browserAccept { + http.Error(w, "missing browser accept", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Accept-Language"); got != browserAcceptLanguage { + http.Error(w, "missing browser accept-language", 
http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Cache-Control"); got != "max-age=0" { + http.Error(w, "missing cache-control", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Upgrade-Insecure-Requests"); got != "1" { + http.Error(w, "missing upgrade-insecure-requests", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-Fetch-Site"); got != "none" { + http.Error(w, "missing sec-fetch-site", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-Fetch-Mode"); got != "navigate" { + http.Error(w, "missing sec-fetch-mode", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-Fetch-User"); got != "?1" { + http.Error(w, "missing sec-fetch-user", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-Fetch-Dest"); got != "document" { + http.Error(w, "missing sec-fetch-dest", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-CH-UA"); got != secCHUA { + http.Error(w, "missing sec-ch-ua", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-CH-UA-Mobile"); got != "?0" { + http.Error(w, "missing sec-ch-ua-mobile", http.StatusNotAcceptable) + return + } + if got := r.Header.Get("Sec-CH-UA-Platform"); got != secCHUAPlatform { + http.Error(w, "missing sec-ch-ua-platform", http.StatusNotAcceptable) + return + } + w.Write([]byte(taRsd1LuandaDrive)) + })) + t.Cleanup(srv.Close) + + oldURI := collectionDayURI + oldClient := collectionHTTPClient + t.Cleanup(func() { + collectionDayURI = oldURI + collectionHTTPClient = oldClient + }) + + collectionDayURI = srv.URL + "/rubbish/%s" + collectionHTTPClient = &http.Client{ + Timeout: time.Second, + Transport: &browserTransport{wrapped: srv.Client().Transport}, + } + + got, err := fetchandparse(t.Context(), "42") + if err != nil { + t.Fatalf("fetchandparse() error = %v", err) + } + if got == nil { + t.Fatal("fetchandparse() returned nil result") + } + }) + + t.Run("collection day detail recovers from 406 with 
browser headers", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch r.URL.Path { + case "/addr": + writeAddrJSON(w, AddrResponse{Items: []Address{*testAddr}}) + case "/rubbish/42": + if r.Header.Get("User-Agent") == userAgent && + r.Header.Get("Accept") == browserAccept && + r.Header.Get("Accept-Language") == browserAcceptLanguage && + r.Header.Get("Cache-Control") == "max-age=0" && + r.Header.Get("Upgrade-Insecure-Requests") == "1" && + r.Header.Get("Sec-Fetch-Site") == "none" && + r.Header.Get("Sec-Fetch-Mode") == "navigate" && + r.Header.Get("Sec-Fetch-User") == "?1" && + r.Header.Get("Sec-Fetch-Dest") == "document" && + r.Header.Get("Sec-CH-UA") == secCHUA && + r.Header.Get("Sec-CH-UA-Mobile") == "?0" && + r.Header.Get("Sec-CH-UA-Platform") == secCHUAPlatform { + w.Write([]byte(taRsd1LuandaDrive)) + return + } + http.Error(w, "collection API returned status code: 406", http.StatusNotAcceptable) + default: + http.NotFound(w, r) + } + })) + t.Cleanup(srv.Close) + + oldAddrURI := addrURI + oldCollectionDayURI := collectionDayURI + oldAddrClient := addrHTTPClient + oldCollectionClient := collectionHTTPClient + t.Cleanup(func() { + addrURI = oldAddrURI + collectionDayURI = oldCollectionDayURI + addrHTTPClient = oldAddrClient + collectionHTTPClient = oldCollectionClient + }) + + addrURI = srv.URL + "/addr" + collectionDayURI = srv.URL + "/rubbish/%s" + addrHTTPClient = &http.Client{ + Timeout: time.Second, + Transport: &browserTransport{wrapped: srv.Client().Transport}, + } + collectionHTTPClient = &http.Client{ + Timeout: time.Second, + Transport: &browserTransport{wrapped: srv.Client().Transport}, + } + + got, err := CollectionDayDetail(t.Context(), "500 Queen Street") + if err != nil { + t.Fatalf("CollectionDayDetail() error = %v", err) + } + + assert.NotNil(t, got) + assert.Equal(t, testAddr, got.Address) + assert.Len(t, got.Collections, 3) + }) +} + func 
TestCollectionDayDetailResult_NextRubbish(t *testing.T) { type fields struct { Collections []RubbishCollection