Skip to content

Commit 50c9eeb

Browse files
committed
Don't decompress gzip if data doesn't look like gzip
Prevents an incorrect response from being returned in cases where a URL such as /sitemap.xml.gz is requested but an uncompressed 404 page is served instead.
1 parent 9ef9229 commit 50c9eeb

File tree

2 files changed

+53
-5
lines changed

2 files changed

+53
-5
lines changed

colly_test.go

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,15 @@ Disallow: /disallowed
4444
Disallow: /allowed*q=
4545
`
4646

47-
const testXml = `<?xml version="1.0" encoding="UTF-8"?>
47+
const testXML = `<?xml version="1.0" encoding="UTF-8"?>
4848
<page>
4949
<title>Test Page</title>
5050
<paragraph type="description">This is a test page</paragraph>
5151
<paragraph type="description">This is a test paragraph</paragraph>
5252
</page>`
5353

54+
const custom404 = `404 not found`
55+
5456
func newTestServer() *httptest.Server {
5557
mux := http.NewServeMux()
5658

@@ -77,13 +79,17 @@ func newTestServer() *httptest.Server {
7779

7880
mux.HandleFunc("/xml", func(w http.ResponseWriter, r *http.Request) {
7981
w.Header().Set("Content-Type", "application/xml")
80-
w.Write([]byte(testXml))
82+
w.Write([]byte(testXML))
8183
})
8284

8385
mux.HandleFunc("/test.xml.gz", func(w http.ResponseWriter, r *http.Request) {
8486
ww := gzip.NewWriter(w)
8587
defer ww.Close()
86-
ww.Write([]byte(testXml))
88+
ww.Write([]byte(testXML))
89+
})
90+
91+
mux.HandleFunc("/nonexistent.xml.gz", func(w http.ResponseWriter, r *http.Request) {
92+
http.Error(w, custom404, http.StatusNotFound)
8793
})
8894

8995
mux.HandleFunc("/login", func(w http.ResponseWriter, r *http.Request) {
@@ -1431,6 +1437,35 @@ func TestCollectorOnXMLWithXMLCompressed(t *testing.T) {
14311437
testCollectorOnXMLWithXML(t, "/test.xml.gz")
14321438
}
14331439

1440+
// TestCollectorNonexistentXMLGZ requests /nonexistent.xml.gz from the test
// server, which answers with a plain-text 404 body, and checks that the body
// is delivered to OnResponse unchanged and that OnError is never invoked.
func TestCollectorNonexistentXMLGZ(t *testing.T) {
1441+
// This is a regression test for colly
1442+
// attempting to gzip-decompress every response for a .xml.gz URL,
1443+
// even when the body is not actually compressed.
1444+
ts := newTestServer()
1445+
defer ts.Close()
1446+
1447+
// NOTE(review): ParseHTTPErrorResponse presumably routes the 404 to OnResponse
// rather than OnError — confirm against the collector options.
c := NewCollector(ParseHTTPErrorResponse())
1448+
1449+
// Records whether the OnResponse callback ran at all.
onResponseCalled := false
1450+
1451+
c.OnResponse(func(resp *Response) {
1452+
onResponseCalled = true
1453+
// Whitespace is trimmed before comparing because http.Error terminates
// the body it writes with a newline.
if got, want := strings.TrimSpace(string(resp.Body)), custom404; got != want {
1454+
t.Errorf("wrong response body got=%q want=%q", got, want)
1455+
}
1456+
})
1457+
1458+
// Any invocation of OnError is reported as a test failure.
c.OnError(func(resp *Response, err error) {
1459+
t.Errorf("called on OnError: err=%v", err)
1460+
})
1461+
1462+
// Visit's return value is ignored here; the callbacks above and the flag
// check below carry the assertions.
c.Visit(ts.URL + "/nonexistent.xml.gz")
1463+
1464+
if !onResponseCalled {
1465+
t.Error("OnResponse was not called")
1466+
}
1467+
}
1468+
14341469
func TestCollectorVisitWithTrace(t *testing.T) {
14351470
ts := newTestServer()
14361471
defer ts.Close()

http_backend.go

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
package colly
1616

1717
import (
18+
"bufio"
1819
"crypto/sha1"
1920
"encoding/gob"
2021
"encoding/hex"
@@ -201,11 +202,23 @@ func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc c
201202
}
202203
contentEncoding := strings.ToLower(res.Header.Get("Content-Encoding"))
203204
if !res.Uncompressed && (strings.Contains(contentEncoding, "gzip") || (contentEncoding == "" && strings.Contains(strings.ToLower(res.Header.Get("Content-Type")), "gzip")) || strings.HasSuffix(strings.ToLower(request.URL.Path), ".xml.gz")) {
204-
bodyReader, err = gzip.NewReader(bodyReader)
205+
// Even if the URL ends in .xml.gz, that doesn't mean we get gzip
206+
// compressed data back. We might get a 404 error page instead,
207+
// for example. So check for the gzip magic bytes first.
208+
bufReader := bufio.NewReader(bodyReader)
209+
bodyReader = bufReader
210+
magic, err := bufReader.Peek(2)
205211
if err != nil {
206212
return nil, err
207213
}
208-
defer bodyReader.(*gzip.Reader).Close()
214+
// gzip magic, as specified in RFC 1952
215+
if magic[0] == 0x1f && magic[1] == 0x8b {
216+
bodyReader, err = gzip.NewReader(bufReader)
217+
if err != nil {
218+
return nil, err
219+
}
220+
defer bodyReader.(*gzip.Reader).Close()
221+
}
209222
}
210223
body, err := ioutil.ReadAll(bodyReader)
211224
if err != nil {

0 commit comments

Comments
 (0)