Question 7

Nice, I get results now. An example row looks like this: `<td rowspan="2" width="300px" class="pspItemClick"><a href="/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D" class="pspItemNameLink">Brick 1 x 1</a> <a class="_blcatLink" onclick="" href="//www.bricklink.com/catalogList.asp?catType=P&catString=5">Brick</a> : 3005</td>`. I would like to isolate the URL of the link and the link's text. Is this possible?

import requests
from bs4 import BeautifulSoup

# Fetch the page, locate the table inside the div with id '_idItemTableForP',
# and print its header texts followed by one line per body cell: the href and
# text of the cell's first <a>, or just the cell text when it has no <a>.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
# NOTE(review): no timeout is passed, so this call can wait indefinitely on an
# unresponsive server.
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')

        if table is not None:
            # Column titles; stays empty when the table has no <thead>.
            headers = []
            thead = table.find('thead')
            if thead is not None:
                for th in thead.find_all('th'):
                    headers.append(th.text.strip())
            else:
                print("Thead not found in the table.")

            # One list per <tr>; each entry is a (href-or-None, text) tuple.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Only the FIRST <a> in the cell is examined; a cell
                        # holding several links (like the sample row in the
                        # question) contributes just its first one.
                        a_tag = td.find('a')
                        if a_tag is not None:
                            # NOTE(review): subscripting raises KeyError when
                            # the <a> has no href attribute -- this is the
                            # failure recorded in the output below.
                            # a_tag.get('href') would return None instead.
                            # Also note this rebinds `url`, which until now
                            # held the page address fetched above.
                            url = a_tag['href']
                            text = a_tag.text.strip()
                            row_data.append((url, text))
                        else:
                            # No link in this cell: keep its text, with None
                            # in the URL slot so every entry has two fields.
                            row_data.append((None, td.text.strip()))
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            if headers:
                print("Headers:", headers)
            else:
                print("No headers found.")
                
            if rows:
                for row in rows:
                    for cell in row:
                        url, text = cell
                        # Truthiness test: both None and an empty href string
                        # take the text-only branch.
                        if url:
                            print(f"URL: {url}, Text: {text}")
                        else:
                            print(f"Text: {text}")
            else:
                print("No rows found.")
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

The diff:

if response.status_code == 200:
             tbody = table.find('tbody')
             if tbody is not None:
                 for tr in tbody.find_all('tr'):
-                    cells = [td.text.strip() for td in tr.find_all('td')]
-                    rows.append(cells)
+                    row_data = []
+                    for td in tr.find_all('td'):
+                        # Find the <a> tag within the <td>
+                        a_tag = td.find('a')
+                        if a_tag is not None:
+                            url = a_tag['href']
+                            text = a_tag.text.strip()
+                            row_data.append((url, text))
+                        else:
+                            row_data.append((None, td.text.strip()))
+                    rows.append(row_data)
             else:
                 print("Tbody not found in the table.")
 
@@ -35,7 +44,12 @@ if response.status_code == 200:
                 
             if rows:
                 for row in rows:
-                    print("Row:", row)
+                    for cell in row:
+                        url, text = cell
+                        if url:
+                            print(f"URL: {url}, Text: {text}")
+                        else:
+                            print(f"Text: {text}")
             else:
                 print("No rows found.")
         else:

Output

Thead not found in the table.

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-8-e5893752d706> in <cell line: 7>()
     29                         a_tag = td.find('a')
     30                         if a_tag is not None:
---> 31                             url = a_tag['href']
     32                             text = a_tag.text.strip()
     33                             row_data.append((url, text))

/usr/local/lib/python3.10/dist-packages/bs4/element.py in __getitem__(self, key)
   1571         """tag[key] returns the value of the 'key' attribute for the Tag,
   1572         and throws an exception if it's not there."""
-> 1573         return self.attrs[key]
   1574 
   1575     def __iter__(self):

KeyError: 'href'


⇦ question 6	Index	question 8 ⇨