Question 11
Where, by accident, I send the LLM an incomplete prompt (“I found that if I replace this”) and it replies with a corrected version of the code anyway
import requests
from bs4 import BeautifulSoup
import csv
import io
# Scrape the item table from the page and print it as CSV.
#
# The script fetches the page, looks up the <div id="_idItemTableForP">,
# takes the first <table> inside it, and writes one CSV row per <tr> found
# in the table's <tbody>. A cell that contains a link contributes two
# columns (the link's href, then the link's text); any other cell
# contributes its stripped text.

# Kept under its own name so the per-cell `url` below does not overwrite it.
page_url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(page_url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')
        if table is not None:
            # Header row: optional, only reported when missing.
            headers = []
            thead = table.find('thead')
            if thead is not None:
                for th in thead.find_all('th'):
                    headers.append(th.text.strip())
            else:
                print("Thead not found in the table.")

            # Data rows.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Find the <a> tag within the <td>
                        a_tag = td.find('a')
                        if a_tag is not None:
                            url = a_tag.get('href', None)  # Use get to avoid KeyError
                            text = a_tag.text.strip()
                            # Separate columns, so the csv module handles
                            # quoting of each value on its own.
                            row_data.append(url)
                            row_data.append(text)
                        else:
                            row_data.append(td.text.strip())
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            # Build the CSV in memory, then print it in one go.
            output = io.StringIO()
            csv_writer = csv.writer(output)
            if headers:
                csv_writer.writerow(headers)
            csv_writer.writerows(rows)
            csv_content = output.getvalue()
            print(csv_content)
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)
The diff:
if a_tag is not None:
url = a_tag.get('href', None) # Use get to avoid KeyError
text = a_tag.text.strip()
- row_data.append(f"{url}, {text}")
+ row_data.append(url)
+ row_data.append(text)
else:
row_data.append(td.text.strip())
rows.append(row_data)
Output
Thead not found in the table.
,,,/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D,Brick 1 x 1,New,"15,730,337","6,663",EUR 0.0001+
Used,"4,921,039","87,937",EUR 0.0013+
,,,/v2/catalog/catalogitem.page?P=3004&name=Brick%201%20x%202&category=%5BBrick%5D,Brick 1 x 2,New,"16,800,431","6,779",EUR 0.0001+
Used,"7,076,722","101,667",EUR 0.0009+
,,,/v2/catalog/catalogitem.page?P=3622&name=Brick%201%20x%203&category=%5BBrick%5D,Brick 1 x 3,New,"3,867,766","5,893",EUR 0.0001+
Used,"1,311,312","47,376",EUR 0.001+
,,,/v2/catalog/catalogitem.page?P=3010&name=Brick%201%20x%204&category=%5BBrick%5D,Brick 1 x 4,New,"5,709,796","6,332",EUR 0.0001+
Used,"1,851,694","68,725",EUR 0.0027+
,,,/v2/catalog/catalogitem.page?P=3009&name=Brick%201%20x%206&category=%5BBrick%5D,Brick 1 x 6,New,"2,288,682","5,834",EUR 0.0001+
Used,"829,247","45,917",EUR 0.0013+
,,,/v2/catalog/catalogitem.page?P=3008&name=Brick%201%20x%208&category=%5BBrick%5D,Brick 1 x 8,New,"1,101,907","5,121",EUR 0.0001+
Used,"350,629","27,114",EUR 0.005+
,,,/v2/catalog/catalogitem.page?P=3003&name=Brick%202%20x%202&category=%5BBrick%5D,Brick 2 x 2,New,"6,762,462","6,463",EUR 0.0001+
Used,"2,931,055","78,657",EUR 0.0043+
...
Nice! However, it seems we get a lot of empty elements, i.e. ",,,".
It also seems that the position of the elements we want is consistent.
⇦ question 10 | Index | question 12 ⇨ |