Vangelis Katsikaros

Question 12

This is better. I want only the 4th and 5th element of each row

import requests
from bs4 import BeautifulSoup
import csv
import io

url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')

        if table is not None:

            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Find the <a> tag within the <td>
                        a_tag = td.find('a')
                        if a_tag is not None:
                            url = a_tag.get('href', None)  # .get() avoids a KeyError if 'href' is missing (note: reassigns the outer "url" variable)
                            text = a_tag.text.strip()
                            row_data.append(url)
                            row_data.append(text)
                        else:
                            row_data.append(td.text.strip())
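                    # Keep only the 4th and 5th collected values (indices 3 and 4)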
                    rows.append(row_data[3:5])
            else:
                print("Tbody not found in the table.")

            output = io.StringIO()
            csv_writer = csv.writer(output)
            
            for row in rows:
                csv_writer.writerow(row)

            csv_content = output.getvalue()
            print(csv_content)
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

The diff.

For no apparent reason the table header code has now been removed. It won't cause a problem, but it was dropped without being asked for:

if response.status_code == 200:
         table = div.find('table')
 
         if table is not None:
-            headers = []
-            thead = table.find('thead')
-            if thead is not None:
-                for th in thead.find_all('th'):
-                    headers.append(th.text.strip())
-            else:
-                print("Thead not found in the table.")
 
             rows = []
             tbody = table.find('tbody')
@@ -36,19 +29,15 @@ if response.status_code == 200:
                             row_data.append(text)
                         else:
                             row_data.append(td.text.strip())
-                    rows.append(row_data)
+                    rows.append(row_data[3:5])
             else:
                 print("Tbody not found in the table.")
 
             output = io.StringIO()
             csv_writer = csv.writer(output)
             
-            if headers:
-                csv_writer.writerow(headers)
-                
-            if rows:
-                for row in rows:
-                    csv_writer.writerow(row)
+            for row in rows:
+                csv_writer.writerow(row)
 
             csv_content = output.getvalue()
             print(csv_content)

Output

/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D,Brick 1 x 1
EUR 0.0013+
/v2/catalog/catalogitem.page?P=3004&name=Brick%201%20x%202&category=%5BBrick%5D,Brick 1 x 2
EUR 0.0009+
/v2/catalog/catalogitem.page?P=3622&name=Brick%201%20x%203&category=%5BBrick%5D,Brick 1 x 3
EUR 0.001+
/v2/catalog/catalogitem.page?P=3010&name=Brick%201%20x%204&category=%5BBrick%5D,Brick 1 x 4
EUR 0.0027+
/v2/catalog/catalogitem.page?P=3009&name=Brick%201%20x%206&category=%5BBrick%5D,Brick 1 x 6
EUR 0.0013+
/v2/catalog/catalogitem.page?P=3008&name=Brick%201%20x%208&category=%5BBrick%5D,Brick 1 x 8
EUR 0.005+
/v2/catalog/catalogitem.page?P=3003&name=Brick%202%20x%202&category=%5BBrick%5D,Brick 2 x 2
EUR 0.0043+
...

Looks good! But it seems we get two types of rows that follow a specific pattern: one with “url, name” and a second one with “EUR price”. We are not interested in the second type of row; we only care about the data in the first.
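One way this could be handled, sketched below, is to filter the rows before writing the CSV. This is only a sketch under an assumption suggested by the output above: the rows we want keep a link/name pair, while the price rows collapse to a single “EUR …” value. The looks_like_item_row helper is hypothetical and not part of the generated code; the sample data is copied from the output above.

import csv
import io

# Sample rows, copied from the output above: "url, name" rows have two
# elements, "EUR price" rows collapse to a single element.
rows = [
    ['/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D', 'Brick 1 x 1'],
    ['EUR 0.0013+'],
    ['/v2/catalog/catalogitem.page?P=3004&name=Brick%201%20x%202&category=%5BBrick%5D', 'Brick 1 x 2'],
    ['EUR 0.0009+'],
]

def looks_like_item_row(row):
    # Hypothetical filter: keep rows that hold a catalog link and a name.
    return len(row) == 2 and row[0].startswith('/v2/catalog/')

output = io.StringIO()
csv_writer = csv.writer(output)
for row in rows:
    if looks_like_item_row(row):
        csv_writer.writerow(row)

print(output.getvalue())  # only the "url, name" rows remain

A plain length check would probably work too; matching the catalog path just makes the intent explicit.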
