Vangelis Katsikaros

Question 13

Oh! I just noticed I want to process only every second row. How do I do this?


import requests
from bs4 import BeautifulSoup
import csv
import io

# Fetch the page at `url`, locate the table inside the div with id
# '_idItemTableForP', and print a two-value slice of every second body row
# as CSV text.
url = ''  # left empty in this listing; fill in the page to scrape
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')

        if table is not None:

            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                row_index = 0
                for tr in tbody.find_all('tr'):
                    # Process only every second row: the ones at odd
                    # 0-based positions (1, 3, 5, ...).
                    if row_index % 2 == 1:
                        row_data = []
                        for td in tr.find_all('td'):
                            # Find the <a> tag within the <td>
                            a_tag = td.find('a')
                            if a_tag is not None:
                                # A link cell contributes two values: href and text.
                                # NOTE: this rebinds the page-level `url`; harmless
                                # here because the request has already been made.
                                url = a_tag.get('href', None)  # Use get to avoid KeyError
                                text = a_tag.text.strip()
                                row_data.append(url)
                                row_data.append(text)
                            else:
                                row_data.append(td.text.strip())
                        # Keep only the values at positions 3 and 4 of the
                        # flattened row.
                        rows.append(row_data[3:5])
                    # Count every row, including the skipped ones.
                    row_index += 1
            else:
                print("Tbody not found in the table.")

            # Serialize the collected rows to CSV in memory, then print them.
            output = io.StringIO()
            csv_writer = csv.writer(output)
            for row in rows:
                csv_writer.writerow(row)

            csv_content = output.getvalue()
            print(csv_content)
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

The diff:

if response.status_code == 200:
             rows = []
             tbody = table.find('tbody')
             if tbody is not None:
+                row_index = 0
                 for tr in tbody.find_all('tr'):
-                    row_data = []
-                    for td in tr.find_all('td'):
-                        # Find the <a> tag within the <td>
-                        a_tag = td.find('a')
-                        if a_tag is not None:
-                            url = a_tag.get('href', None)  # Use get to avoid KeyError
-                            text = a_tag.text.strip()
-                            row_data.append(url)
-                            row_data.append(text)
-                        else:
-                            row_data.append(td.text.strip())
-                    rows.append(row_data[3:5])
+                    if row_index % 2 == 1:
+                        row_data = []
+                        for td in tr.find_all('td'):
+                            # Find the <a> tag within the <td>
+                            a_tag = td.find('a')
+                            if a_tag is not None:
+                                url = a_tag.get('href', None)  # Use get to avoid KeyError
+                                text = a_tag.text.strip()
+                                row_data.append(url)
+                                row_data.append(text)
+                            else:
+                                row_data.append(td.text.strip())
+                        rows.append(row_data[3:5])
+                    row_index += 1
             else:
                 print("Tbody not found in the table.")


/v2/catalog/,Brick 1 x 1
/v2/catalog/,Brick 1 x 2
/v2/catalog/,Brick 1 x 3
/v2/catalog/,Brick 1 x 4
/v2/catalog/,Brick 1 x 6
/v2/catalog/,Brick 1 x 8
/v2/catalog/,Brick 2 x 2
/v2/catalog/,Brick 2 x 3
/v2/catalog/,Brick 2 x 4
/v2/catalog/,Brick 2 x 6
/v2/catalog/,Brick 2 x 8
/v2/catalog/,Brick 4 x 6
/v2/catalog/,Brick 8 x 8
/v2/catalog/,Brick 1 x 10
/v2/catalog/,Brick 1 x 12
/v2/catalog/,Brick 1 x 16

Oh, that looks much better!

⇦ question 12 Index question 14 ⇨