diff --git a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md index 7eb6f66185..db62d6394a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/09_getting_links.md @@ -43,16 +43,15 @@ if (response.ok) { const html = await response.text(); const $ = cheerio.load(html); - const data = []; - $(".product-item").each((i, element) => { - const productItem = $(element); + const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -66,8 +65,9 @@ if (response.ok) { priceRange.price = priceRange.minPrice; } - data.push({ title: titleText, ...priceRange }); + return { title, ...priceRange }; }); + const data = $items.get(); const jsonData = JSON.stringify(data); await writeFile('products.json', jsonData); @@ -97,13 +97,13 @@ async function download(url) { Next, we can put parsing into a `parseProduct()` function, which takes the product item element and returns the object with data: ```js -function parseProduct(productItem) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); +function parseProduct($productItem) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -117,24 +117,18 @@ function parseProduct(productItem) { priceRange.price = priceRange.minPrice; } - return { title: titleText, ...priceRange }; + return { title, ...priceRange }; } ``` Now the JSON export. For better readability, let's make a small change here and set the indentation level to two spaces: ```js -async function exportJSON(data) { +function exportJSON(data) { return JSON.stringify(data, null, 2); } ``` -:::note Why asynchronous? - -The `exportJSON()` function doesn't need to be `async` now, but keeping it makes future changes easier — like switching to an async JSON parser. It also stays consistent with the upcoming `exportCSV()` function, which must be asynchronous. - -::: - The last function we'll add will take care of the CSV export: ```js @@ -161,13 +155,13 @@ async function download(url) { } } -function parseProduct(productItem) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); +function parseProduct($productItem) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); - const price = productItem.find(".price").contents().last(); + const $price = $productItem.find(".price").contents().last(); const priceRange = { minPrice: null, price: null }; - const priceText = price + const priceText = $price .text() .trim() .replace("$", "") @@ -181,10 +175,10 @@ function parseProduct(productItem) { priceRange.price = priceRange.minPrice; } - return { title: titleText, ...priceRange }; + return { title, ...priceRange }; } -async function exportJSON(data) { +function exportJSON(data) { return JSON.stringify(data, null, 2); } @@ -196,14 +190,14 @@ async function exportCSV(data) { const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" const $ = await download(listingURL); -const data = [] -$(".product-item").each((i, element) => { - const productItem = $(element); - const item = parseProduct(productItem); - data.push(item); +const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); + const item = parseProduct($productItem); + return item; }); +const data = $items.get(); -await writeFile('products.json', await exportJSON(data)); +await writeFile('products.json', exportJSON(data)); await writeFile('products.csv', await exportCSV(data)); ``` @@ -232,14 +226,14 @@ Several methods exist for transitioning from one page to another, but the most c In DevTools, we can see that each product title is, in fact, also a link element. We already locate the titles, so that makes our task easier. We just need to edit the code so that it extracts not only the text of the element but also the `href` attribute. Cheerio selections support accessing attributes using the `.attr()` method: ```js -function parseProduct(productItem) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); - const url = title.attr("href"); +function parseProduct($productItem) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); + const url = $title.attr("href"); ... - return { url, title: titleText, ...priceRange }; + return { url, title, ...priceRange }; } ``` @@ -274,15 +268,15 @@ We'll change the `parseProduct()` function so that it also takes the base URL as ```js // highlight-next-line -function parseProduct(productItem, baseURL) { - const title = productItem.find(".product-item__title"); - const titleText = title.text().trim(); +function parseProduct($productItem, baseURL) { + const $title = $productItem.find(".product-item__title"); + const title = $title.text().trim(); // highlight-next-line - const url = new URL(title.attr("href"), baseURL).href; + const url = new URL($title.attr("href"), baseURL).href; ... - return { url, title: titleText, ...priceRange }; + return { url, title, ...priceRange }; } ``` @@ -292,13 +286,13 @@ Now we'll pass the base URL to the function in the main body of our program: const listingURL = "https://warehouse-theme-metal.myshopify.com/collections/sales" const $ = await download(listingURL); -const data = [] -$(".product-item").each((i, element) => { - const productItem = $(element); +const $items = $(".product-item").map((i, element) => { + const $productItem = $(element); // highlight-next-line - const item = parseProduct(productItem, listingURL); - data.push(item); + const item = parseProduct($productItem, listingURL); + return item; }); +const data = $items.get(); ``` When we run the scraper now, we should see full URLs in our exports: