From 4d2824c29b903228a1c1b984f9090b5e77905dca Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 14:21:14 +0200 Subject: [PATCH 1/5] style: change order, first json, then csv Making this change because in Python it doesn't matter and in JavaScript it's easier to start with JSON, which is built-in, and only then move to CSV, which requires an additional library. --- .../webscraping/scraping_basics_python/08_saving_data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 6567e24efa..27656f7f02 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -168,7 +168,7 @@ Bob,42,"reading, TypeScript" In the CSV format, if a value contains commas, we should enclose it in quotes. When we open the file in a text editor of our choice, we can see that the writer automatically handled this. -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have. +When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. ![CSV example preview](images/csv-example.png) From 0831842df64b36610bd7a834eb9a2ca8d0654411 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 15:41:23 +0200 Subject: [PATCH 2/5] fix: various improvements to the Python lesson about saving data --- .../webscraping/scraping_basics_python/08_saving_data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md index 27656f7f02..6567e24efa 100644 --- a/sources/academy/webscraping/scraping_basics_python/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_python/08_saving_data.md @@ -168,7 +168,7 @@ Bob,42,"reading, TypeScript" In the CSV format, if a value contains commas, we should enclose it in quotes. When we open the file in a text editor of our choice, we can see that the writer automatically handled this. -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. +When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have. 
![CSV example preview](images/csv-example.png) From 03d240814aab3bb627c19f105146c06ad7ba1db3 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Fri, 27 Jun 2025 15:41:44 +0200 Subject: [PATCH 3/5] fix: update saving to be about JS --- .../08_saving_data.md | 292 +++++++++--------- 1 file changed, 140 insertions(+), 152 deletions(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index e1ad7365ae..72a6e140a8 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -13,9 +13,9 @@ unlisted: true We managed to scrape data about products and print it, with each product separated by a new line and each field separated by the `|` character. This already produces structured text that can be parsed, i.e., read programmatically. ```text -$ python main.py -JBL Flip 4 Waterproof Portable Bluetooth Speaker | 74.95 | 74.95 -Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 1398.00 | None +$ node index.js +JBL Flip 4 Waterproof Portable Bluetooth Speaker | 7495 | 7495 +Sony XBR-950G BRAVIA 4K HDR Ultra HD TV | 139800 | null ... ``` @@ -27,220 +27,208 @@ We should use widely popular formats that have well-defined solutions for all th Producing results line by line is an efficient approach to handling large datasets, but to simplify this lesson, we'll store all our data in one variable. This'll take three changes to our program: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal - -url = "https://warehouse-theme-metal.myshopify.com/collections/sales" -response = httpx.get(url) -response.raise_for_status() - -html_code = response.text -soup = BeautifulSoup(html_code, "html.parser") - -# highlight-next-line -data = [] -for product in soup.select(".product-item"): - title = product.select_one(".product-item__title").text.strip() - - price_text = ( - product - .select_one(".price") - .contents[-1] - .strip() - .replace("$", "") - .replace(",", "") - ) - if price_text.startswith("From "): - min_price = Decimal(price_text.removeprefix("From ")) - price = None - else: - min_price = Decimal(price_text) - price = min_price - - # highlight-next-line - data.append({"title": title, "min_price": min_price, "price": price}) - -# highlight-next-line -print(data) +```js +import * as cheerio from 'cheerio'; + +const url = "https://warehouse-theme-metal.myshopify.com/collections/sales"; +const response = await fetch(url); + +if (response.ok) { + const html = await response.text(); + const $ = cheerio.load(html); + + // highlight-next-line + const data = []; + $(".product-item").each((i, element) => { + const productItem = $(element); + + const title = productItem.find(".product-item__title"); + const titleText = title.text().trim(); + + const price = productItem.find(".price").contents().last(); + const priceRange = { minPrice: null, price: null }; + const priceText = price + .text() + .trim() + .replace("$", "") + .replace(".", "") + .replace(",", ""); + + if (priceText.startsWith("From ")) { + priceRange.minPrice = parseInt(priceText.replace("From ", "")); + } else { + priceRange.minPrice = parseInt(priceText); + priceRange.price = priceRange.minPrice; + } + + // highlight-next-line + data.push({ title: titleText, ...priceRange }) + }); + + // highlight-next-line + console.log(data); +} else { + throw new Error(`HTTP ${response.status}`); +} ``` -Before looping over the 
products, we prepare an empty list. Then, instead of printing each line, we append the data of each product to the list in the form of a Python dictionary. At the end of the program, we print the entire list at once. +Before looping over the products, we prepare an empty array. Then, instead of printing each line, we append the data of each product to the array in the form of a JavaScript object. At the end of the program, we print the entire array at once. ```text -$ python main.py -[{'title': 'JBL Flip 4 Waterproof Portable Bluetooth Speaker', 'min_price': Decimal('74.95'), 'price': Decimal('74.95')}, {'title': 'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV', 'min_price': Decimal('1398.00'), 'price': None}, ...] +$ node index.js +[ + { + title: 'JBL Flip 4 Waterproof Portable Bluetooth Speaker', + minPrice: 7495, + price: 7495 + }, + { + title: 'Sony XBR-950G BRAVIA 4K HDR Ultra HD TV', + minPrice: 139800, + price: null + }, + ... +] ``` -:::tip Pretty print +:::tip Spread syntax + +The three dots in `{ title: titleText, ...priceRange }` are called [spread syntax](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Spread_syntax). It's the same as if we wrote the following: -If you find the complex data structures printed by `print()` difficult to read, try using [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp) from the `pprint` module instead. +```js +{ + title: titleText, + minPrice: priceRange.minPrice, + price: priceRange.price, +} +``` ::: -## Saving data as CSV +## Saving data as JSON -The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheets apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets. +The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of JavaScript objects, but people now use it accross programming languages. -In Python, it's convenient to read and write CSV files, thanks to the [`csv`](https://docs.python.org/3/library/csv.html) standard library module. First let's try something small in the Python's interactive REPL to familiarize ourselves with the basic usage: +We'll begin with importing the `writeFile` function from the Node.js standard library, so that we can, well, write files: -```py ->>> import csv ->>> with open("data.csv", "w") as file: -... writer = csv.DictWriter(file, fieldnames=["name", "age", "hobbies"]) -... writer.writeheader() -... writer.writerow({"name": "Alice", "age": 24, "hobbies": "kickbox, Python"}) -... writer.writerow({"name": "Bob", "age": 42, "hobbies": "reading, TypeScript"}) -... +```js +import * as cheerio from 'cheerio'; +// highlight-next-line +import { writeFile } from "fs/promises"; ``` -We first opened a new file for writing and created a `DictWriter()` instance with the expected field names. We instructed it to write the header row first and then added two more rows containing actual data. The code produced a `data.csv` file in the same directory where we're running the REPL. It has the following contents: +Next, instead of printing the data, we'll finish the program by exporting it to JSON. 
Let's replace the line `console.log(data)` with the following: -```csv title=data.csv -name,age,hobbies -Alice,24,"kickbox, Python" -Bob,42,"reading, TypeScript" +```js +const jsonData = JSON.stringify(data); +await writeFile('products.json', jsonData); ``` -In the CSV format, if values contain commas, we should enclose them in quotes. You can see that the writer automatically handled this. - -When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it as well. If you're using a different operating system, try opening the file with any spreadsheet program you have. - -![CSV example preview](images/csv-example.png) - -Now that's nice, but we didn't want Alice, Bob, kickbox, or TypeScript. What we actually want is a CSV containing `Sony XBR-950G BRAVIA 4K HDR Ultra HD TV`, right? Let's do this! First, let's add `csv` to our imports: +That's it! If we run our scraper now, it won't display any output, but it will create a `products.json` file in the current working directory, which contains all the data about the listed products: -```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -# highlight-next-line -import csv + +```json title=products.json +[{"title":"JBL Flip 4 Waterproof Portable Bluetooth Speaker","minPrice":7495,"price":7495},{"title":"Sony XBR-950G BRAVIA 4K HDR Ultra HD TV","minPrice":139800,"price":null},...] ``` -Next, instead of printing the data, we'll finish the program by exporting it to CSV. Replace `print(data)` with the following: +If you skim through the data, you'll notice that the `JSON.stringify()` function handled some potential issues, such as escaping double quotes found in one of the titles by adding a backslash: -```py -with open("products.csv", "w") as file: - writer = csv.DictWriter(file, fieldnames=["title", "min_price", "price"]) - writer.writeheader() - for row in data: - writer.writerow(row) +```json +{"title":"Sony SACS9 10\" Active Subwoofer","minPrice":15800,"price":15800} ``` -If we run our scraper now, it won't display any output, but it will create a `products.csv` file in the current working directory, which contains all the data about the listed products. - -![CSV preview](images/csv.png) - -## Saving data as JSON - -The JSON format is popular primarily among developers. We use it for storing data, configuration files, or as a way to transfer data between programs (e.g., APIs). Its origin stems from the syntax of objects in the JavaScript programming language, which is similar to the syntax of Python dictionaries. +:::tip Pretty JSON -In Python, there's a [`json`](https://docs.python.org/3/library/json.html) standard library module, which is so straightforward that we can start using it in our code right away. We'll need to begin with imports: +While a compact JSON file without any whitespace is efficient for computers, it can be difficult for humans to read. You can call `JSON.stringify(data, null, 2)` for prettier output. See [documentation](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify) for explanation of the parameters and more examples. 
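For example, a short sketch like this (reusing the `data` array from our scraper; the `products-pretty.json` filename is only for illustration) would save an indented copy next to the compact one:

```js
// Pretty-print with 2-space indentation before writing the file (illustrative filename)
const prettyJson = JSON.stringify(data, null, 2);
await writeFile('products-pretty.json', prettyJson);
```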
-```py -import httpx -from bs4 import BeautifulSoup -from decimal import Decimal -import csv -# highlight-next-line -import json -``` +::: -Next, let’s append one more export to end of the source code of our scraper: +## Saving data as CSV -```py -with open("products.json", "w") as file: - json.dump(data, file) -``` +The CSV format is popular among data analysts because a wide range of tools can import it, including spreadsheets apps like LibreOffice Calc, Microsoft Excel, Apple Numbers, and Google Sheets. -That’s it! If we run the program now, it should also create a `products.json` file in the current working directory: +Neither JavaScript itself nor Node.js offers anything built-in to read and write CSV, so we'll need to install a library. We'll use [json2csv](https://juanjodiaz.github.io/json2csv/), a _de facto_ standard for working with CSV in JavaScript: ```text -$ python main.py -Traceback (most recent call last): - ... - raise TypeError(f'Object of type {o.__class__.__name__} ' -TypeError: Object of type Decimal is not JSON serializable -``` - -Ouch! JSON supports integers and floating-point numbers, but there's no guidance on how to handle `Decimal`. To maintain precision, it's common to store monetary values as strings in JSON files. But this is a convention, not a standard, so we need to handle it manually. We'll pass a custom function to `json.dump()` to serialize objects that it can't handle directly: +$ npm install @json2csv/node --save -```py -def serialize(obj): - if isinstance(obj, Decimal): - return str(obj) - raise TypeError("Object not JSON serializable") - -with open("products.json", "w") as file: - json.dump(data, file, default=serialize) +added 4 packages, and audited 28 packages in 1s +... ``` -Now the program should work as expected, producing a JSON file with the following content: +Once installed, we can add the following line to our imports: - -```json title=products.json -[{"title": "JBL Flip 4 Waterproof Portable Bluetooth Speaker", "min_price": "74.95", "price": "74.95"}, {"title": "Sony XBR-950G BRAVIA 4K HDR Ultra HD TV", "min_price": "1398.00", "price": null}, ...] +```js +import * as cheerio from 'cheerio'; +import { writeFile } from "fs/promises"; +// highlight-next-line +import { AsyncParser } from '@json2csv/node'; ``` -If you skim through the data, you'll notice that the `json.dump()` function handled some potential issues, such as escaping double quotes found in one of the titles by adding a backslash: +Then, let's add one more data export near the end of the source code of our scraper: -```json -{"title": "Sony SACS9 10\" Active Subwoofer", "min_price": "158.00", "price": "158.00"} +```js +const parser = new AsyncParser(); +const csvData = await parser.parse(data).promise(); +await writeFile("products.csv", csvData); ``` -:::tip Pretty JSON +The program should now also produce a `data.csv` file. When browsing the directory on macOS, we can see a nice preview of the file's contents, which proves that the file is correct and that other programs can read it. If you're using a different operating system, try opening the file with any spreadsheet program you have. -While a compact JSON file without any whitespace is efficient for computers, it can be difficult for humans to read. You can pass `indent=2` to `json.dump()` for prettier output. +![CSV preview](images/csv.png) -Also, if your data contains non-English characters, set `ensure_ascii=False`. 
By default, Python encodes everything except [ASCII](https://en.wikipedia.org/wiki/ASCII), which means it would save [Bún bò Nam Bô](https://vi.wikipedia.org/wiki/B%C3%BAn_b%C3%B2_Nam_B%E1%BB%99) as `B\\u00fan b\\u00f2 Nam B\\u00f4`. +In the CSV format, if a value contains commas, we should enclose it in quotes. If it contains quotes, we should double them. When we open the file in a text editor of our choice, we can see that the library automatically handled this: -::: +```csv title=data.csv +"title","minPrice","price" +"JBL Flip 4 Waterproof Portable Bluetooth Speaker",7495,7495 +"Sony XBR-950G BRAVIA 4K HDR Ultra HD TV",139800, +"Sony SACS9 10"" Active Subwoofer",15800,15800 +... +"Samsung Surround Sound Bar Home Speaker, Set of 7 (HW-NW700/ZA)",64799,64799 +... +``` -We've built a Python application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages. +We've built a Node.js application that downloads a product listing, parses the data, and saves it in a structured format for further use. But the data still has gaps: for some products, we only have the min price, not the actual prices. In the next lesson, we'll attempt to scrape more details from all the product pages. --- ## Exercises -In this lesson, you learned how to create export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them. +In this lesson, we created export files in two formats. The following challenges are designed to help you empathize with the people who'd be working with them. -### Process your CSV +### Process your JSON -Open the `products.csv` file in a spreadsheet app. Use the app to find all products with a min price greater than $500. +Write a new Node.js program that reads `products.json`, finds all products with a min price greater than $500, and prints each of them.
Solution - Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account: - - 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data. - 2. Select the header row. Go to **Data > Create filter**. - 3. Use the filter icon that appears next to `min_price`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data. + ```js + import { readFile } from "fs/promises"; - ![CSV in Google Sheets](images/csv-sheets.png) + const jsonData = await readFile("products.json"); + const data = JSON.parse(jsonData); + data + .filter(row => row.minPrice > 50000) + .forEach(row => console.log(row)); + ```
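  The threshold is `50000` rather than `500` because our scraper stores prices as whole cents, so `7495` means $74.95 and $500 corresponds to `50000`. If you also want the output in dollars, a small variation could format it like this:

  ```js
  // Same filter as above, but printing prices converted from cents to dollars
  data
    .filter(row => row.minPrice > 50000)
    .forEach(row => console.log(`${row.title}: $${(row.minPrice / 100).toFixed(2)}`));
  ```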
-### Process your JSON +### Process your CSV -Write a new Python program that reads `products.json`, finds all products with a min price greater than $500, and prints each one using [`pp()`](https://docs.python.org/3/library/pprint.html#pprint.pp). +Open the `products.csv` file we created in the lesson using a spreadsheet application. Then, in the app, find all products with a min price greater than $500.
Solution - ```py - import json - from pprint import pp - from decimal import Decimal + Let's use [Google Sheets](https://www.google.com/sheets/about/), which is free to use. After logging in with a Google account: - with open("products.json", "r") as file: - products = json.load(file) + 1. Go to **File > Import**, choose **Upload**, and select the file. Import the data using the default settings. You should see a table with all the data. + 2. Select the header row. Go to **Data > Create filter**. + 3. Use the filter icon that appears next to `minPrice`. Choose **Filter by condition**, select **Greater than**, and enter **500** in the text field. Confirm the dialog. You should see only the filtered data. - for product in products: - if Decimal(product["min_price"]) > 500: - pp(product) - ``` + ![CSV in Google Sheets](images/csv-sheets.png)
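  One caveat: the exported `minPrice` column holds whole cents (`7495` stands for $74.95), so if you want the filter to really mean $500, enter **50000** instead of **500**.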
From 3b3a626da26e4754014c6a76e461b90844cdb680 Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 10:15:04 +0200 Subject: [PATCH 4/5] fix: missing semicolon in saving data --- .../webscraping/scraping_basics_javascript2/08_saving_data.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index 72a6e140a8..bfbb383697 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -62,7 +62,7 @@ if (response.ok) { } // highlight-next-line - data.push({ title: titleText, ...priceRange }) + data.push({ title: titleText, ...priceRange }); }); // highlight-next-line From 38f6fc2889091432ffd288965049a77ed00a0f4c Mon Sep 17 00:00:00 2001 From: Honza Javorek Date: Mon, 30 Jun 2025 10:30:06 +0200 Subject: [PATCH 5/5] fix: make it clearer in saving data that we append more code --- .../webscraping/scraping_basics_javascript2/08_saving_data.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md index bfbb383697..332567b92a 100644 --- a/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md +++ b/sources/academy/webscraping/scraping_basics_javascript2/08_saving_data.md @@ -168,6 +168,9 @@ import { AsyncParser } from '@json2csv/node'; Then, let's add one more data export near the end of the source code of our scraper: ```js +const jsonData = JSON.stringify(data); +await writeFile('products.json', jsonData); + const parser = new AsyncParser(); const csvData = await parser.parse(data).promise(); await writeFile("products.csv", csvData);