Vangelis Katsikaros

Question 15

Oops I get a IndexError: list index out of range and I think you forgot the CSV printing code?

import requests
from bs4 import BeautifulSoup
import csv
import io

# Fetch the page, collect one "code" string per table cell, print the first
# code of every row and write all rows to 'extracted_codes.csv'.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)

# Only parse on HTTP 200; any other status is reported by the final else.
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # The table is looked up inside the <div id="_idItemTableForP">.
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')

        if table is not None:

            # One list per <tr>, holding the values gathered from its <td> cells.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:

                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Find the second <a> tag within the <td>
                        a_tags = td.find_all('a')
                        if len(a_tags) >= 2:
                            second_a_tag = a_tags[1]
                            code_span = second_a_tag.find('span', class_='pspItemCateAndNo')
                            # NOTE(review): if the span is missing, nothing is
                            # appended for this cell, so the row ends up with
                            # fewer entries than it has <td> cells.
                            if code_span is not None:
                                # Keep only the text after the last ':' in the span.
                                code = code_span.text.strip().split(':')[-1].strip()
                                row_data.append(code)
                        else:
                            # Cells with fewer than two links get an empty placeholder.
                            row_data.append('')
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            # NOTE(review): row[0] raises IndexError for any row whose list is
            # empty (a <tr> with no <td>, or cells that appended nothing above).
            for row in rows:
                print("Code:", row[0])
            
            # Write extracted codes to a CSV file
            # newline='' lets the csv module control line endings itself.
            with open('extracted_codes.csv', 'w', newline='') as csvfile:
                csv_writer = csv.writer(csvfile)
                for row in rows:
                    csv_writer.writerow(row)
                    
            print("CSV file 'extracted_codes.csv' has been created with the extracted codes.")
            
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

The diff:

if response.status_code == 200:
 
             for row in rows:
                 print("Code:", row[0])
+
+            # Write extracted codes to a CSV file
+            with open('extracted_codes.csv', 'w', newline='') as csvfile:
+                csv_writer = csv.writer(csvfile)
+                for row in rows:
+                    csv_writer.writerow(row)
+
+            print("CSV file 'extracted_codes.csv' has been created with the extracted codes.")
+
         else:
             print("Table not found within the div. Check the structure.")
     else:

At this stage I believe the chat with the LLM took a wrong turn… It added CSV code, but it’s now using a file as the output, not stdout.

⇦ question 14 Index question 16 ⇨