Question 15
Oops I get a IndexError: list index out of range
and I think you forgot the CSV printing code?
import requests
from bs4 import BeautifulSoup
import csv
import io
# Fetch the page, pull the item codes out of the table inside the
# '_idItemTableForP' div, print them, and save them to 'extracted_codes.csv'.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'

# A timeout prevents the script from blocking forever if the server
# never answers; requests applies no timeout by default.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')
        if table is not None:
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # The code lives in the second <a> tag of the cell;
                        # cells with fewer than two links contribute nothing.
                        a_tags = td.find_all('a')
                        if len(a_tags) >= 2:
                            code_span = a_tags[1].find(
                                'span', class_='pspItemCateAndNo'
                            )
                            # NOTE(review): the source lost its indentation;
                            # this else is assumed to pair with the span
                            # check (nearest if) -- confirm against the page.
                            if code_span is not None:
                                # Keep only the text after the last ':'.
                                code = code_span.text.strip().split(':')[-1].strip()
                                row_data.append(code)
                            else:
                                row_data.append('')
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            for row in rows:
                # A <tr> that produced no cells leaves an empty list in
                # rows; indexing it directly raised IndexError.
                print("Code:", row[0] if row else '')

            # Write extracted codes to a CSV file
            with open('extracted_codes.csv', 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                csv_writer.writerows(rows)

            print("CSV file 'extracted_codes.csv' has been created with the extracted codes.")
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)
The diff:
if response.status_code == 200:
for row in rows:
print("Code:", row[0])
+
+ # Write extracted codes to a CSV file
+ with open('extracted_codes.csv', 'w', newline='') as csvfile:
+ csv_writer = csv.writer(csvfile)
+ for row in rows:
+ csv_writer.writerow(row)
+
+ print("CSV file 'extracted_codes.csv' has been created with the extracted codes.")
+
else:
print("Table not found within the div. Check the structure.")
else:
At this stage I believe the chat with the LLM took a wrong turn… It added the CSV code, but it’s writing the output to a file, not to stdout.
⇦ question 14 | Index | question 16 ⇨ |