Vangelis Katsikaros

Question 14

That’s nice. In each row there is a second <td> element that has a code in a <span> element. Can you extract it? Example row:

<td rowspan="2" width="300px" class="pspItemClick"><a href="/v2/catalog/catalogitem.page?P=3005&amp;name=Brick%201%20x%201&amp;category=%5BBrick%5D" class="pspItemNameLink">Brick 1 x 1</a><br><span class="pspItemCateAndNo"><span class="blcatList"><a class="_blcatLink" onclick="" href="//www.bricklink.com/catalogList.asp?catType=P&amp;catString=5">Brick</a></span> : 3005</span><span class="pspPCC"></span></td>

import requests
from bs4 import BeautifulSoup
import csv
import io

# Fetch the page, locate the item table, and print one extracted "code" per
# table row. Every lookup failure is reported with a printed message rather
# than an exception.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)

# Only parse the body when the server answered 200.
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # The table of interest is the first <table> inside the div with this id.
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')

        if table is not None:

            # One list per <tr>; stays empty when no <tbody> is found.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:

                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Collect every <a> in the cell; the code is looked up
                        # through the second one.
                        a_tags = td.find_all('a')
                        if len(a_tags) >= 2:
                            second_a_tag = a_tags[1]
                            # NOTE(review): this searches for the span *inside*
                            # the second <a>. In the example row markup the
                            # 'pspItemCateAndNo' span encloses that <a> instead,
                            # so for such a cell this returns None.
                            code_span = second_a_tag.find('span', class_='pspItemCateAndNo')
                            if code_span is not None:
                                # Keep the text after the last ':' as the code.
                                code = code_span.text.strip().split(':')[-1].strip()
                                row_data.append(code)
                            # No else branch here: when the span is missing,
                            # nothing is appended for this cell.
                        else:
                            # Cells with fewer than two links contribute an
                            # empty placeholder.
                            row_data.append('')
                    # Appended unconditionally, so row_data may be an empty
                    # list (a <tr> without <td> cells, or only cells that hit
                    # the "span missing" path above).
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            for row in rows:
                # NOTE(review): row[0] raises IndexError when the row's list is
                # empty; nothing guards against that case here.
                print("Code:", row[0])
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

The diff. For no reason, the modulo (%) related code is now removed. Also, for no reason, the CSV printing code is removed:

if response.status_code == 200:
             rows = []
             tbody = table.find('tbody')
             if tbody is not None:
-                row_index = 0
+
                 for tr in tbody.find_all('tr'):
-                    if row_index % 2 == 0:
-                        row_data = []
-                        for td in tr.find_all('td'):
-                            # Find the <a> tag within the <td>
-                            a_tag = td.find('a')
-                            if a_tag is not None:
-                                url = a_tag.get('href', None)  # Use get to avoid KeyError
-                                text = a_tag.text.strip()
-                                row_data.append(url)
-                                row_data.append(text)
-                            else:
-                                row_data.append(td.text.strip())
-                        rows.append(row_data[3:5])
-                    row_index += 1
+                    row_data = []
+                    for td in tr.find_all('td'):
+                        # Find the second <a> tag within the <td>
+                        a_tags = td.find_all('a')
+                        if len(a_tags) >= 2:
+                            second_a_tag = a_tags[1]
+                            code_span = second_a_tag.find('span', class_='pspItemCateAndNo')
+                            if code_span is not None:
+                                code = code_span.text.strip().split(':')[-1].strip()
+                                row_data.append(code)
+                        else:
+                            row_data.append('')
+                    rows.append(row_data)
             else:
                 print("Tbody not found in the table.")
 
-            output = io.StringIO()
-            csv_writer = csv.writer(output)
-            
             for row in rows:
-                csv_writer.writerow(row)
-
-            csv_content = output.getvalue()
-            print(csv_content)
+                print("Code:", row[0])
         else:
             print("Table not found within the div. Check the structure.")

Output

---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-15-a16af276881b> in <cell line: 13>()
     49             # Print the extracted codes
     50             for row in rows:
---> 51                 print("Code:", row[0])
     52         else:
     53             print("Table not found within the div. Check the structure.")

IndexError: list index out of range
...
⇦ question 13 Index question 15 ⇨