-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Open
Description
https://pixabay.com/zh/photos/search/?order=ec&pagi=1
We want to use colly to fetch some images from this website,
but the request returns 403 Forbidden, while the same URL requested from Postman returns 200. Why?
package main
import (
"strconv"
"github.com/gocolly/colly"
)
// main crawls the Pixabay photo search result pages, prints the text of every
// <script> element it finds, and follows the src URL of external scripts.
// Every outgoing request URL and every failed response is printed as well.
func main() {
	// Inclusive range of result pages to request (the "pagi" query parameter).
	const (
		firstPage = 1
		lastPage  = 1
	)

	c := colly.NewCollector(
		// Requests are issued asynchronously; c.Wait() at the end of main
		// blocks until all of them have finished.
		colly.Async(true),
	)
	c.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60"

	// Allow at most 4 parallel requests across all domains.
	if err := c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 4}); err != nil {
		println(err.Error())
		return
	}

	// Print each script's inline content, then follow external scripts.
	c.OnHTML("script", func(e *colly.HTMLElement) {
		println(e.Text)

		// Inline scripts carry no src attribute, so there is nothing to follow.
		src := e.Attr("src")
		if src == "" {
			return
		}
		if err := e.Request.Visit(src); err != nil {
			println(err.Error())
		}
	})

	c.OnRequest(func(r *colly.Request) {
		println(r.URL.String())
	})

	c.OnError(func(r *colly.Response, e error) {
		println(r.StatusCode)
		println(e.Error())
	})

	for i := firstPage; i <= lastPage; i++ {
		if err := c.Visit("https://pixabay.com/zh/photos/search/?order=ec&pagi=" + strconv.Itoa(i)); err != nil {
			println(err.Error())
		}
	}
	c.Wait()
}
[Running] go run "~/pixabay_spider/main.go"
https://pixabay.com/zh/photos/search/?order=ec&pagi=1
403
Forbidden
Metadata
Metadata
Assignees
Labels
No labels