Question 7
Nice, I get results now. An example row looks like this <td rowspan="2" width="300px" class="pspItemClick"><a href="/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D" class="pspItemNameLink">Brick 1 x 1</a><br><span class="pspItemCateAndNo"><span class="blcatList"><a class="_blcatLink" onclick="" href="//www.bricklink.com/catalogList.asp?catType=P&catString=5">Brick</a></span> : 3005</span><span class="pspPCC"></span></td>
I would like to extract the link's URL (its href) and the link's text from each cell. Is this possible?
import requests
from bs4 import BeautifulSoup
# Fetch the demo catalogue page and, for every cell of the item table, print
# the first link's URL together with its text (or only the cell text when the
# cell contains no link).
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # The item table lives inside the div with this id.
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')
        if table is not None:
            # Column headers, when the table has a <thead>.
            headers = []
            thead = table.find('thead')
            if thead is not None:
                headers = [th.text.strip() for th in thead.find_all('th')]
            else:
                print("Thead not found in the table.")

            # Each row becomes a list of (href, text) tuples, one per <td>;
            # href is None for cells without a usable link.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # href=True restricts the match to <a> tags that
                        # actually carry an href attribute. A bare
                        # td.find('a') can return an anchor without one, and
                        # indexing a_tag['href'] on it raises KeyError.
                        a_tag = td.find('a', href=True)
                        if a_tag is not None:
                            row_data.append((a_tag['href'], a_tag.text.strip()))
                        else:
                            row_data.append((None, td.text.strip()))
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            if headers:
                print("Headers:", headers)
            else:
                print("No headers found.")

            if rows:
                for row in rows:
                    # Named href (not url) so the page URL above is not
                    # overwritten by the last link seen.
                    for href, text in row:
                        if href:
                            print(f"URL: {href}, Text: {text}")
                        else:
                            print(f"Text: {text}")
            else:
                print("No rows found.")
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)
The diff:
if response.status_code == 200:
tbody = table.find('tbody')
if tbody is not None:
for tr in tbody.find_all('tr'):
- cells = [td.text.strip() for td in tr.find_all('td')]
- rows.append(cells)
+ row_data = []
+ for td in tr.find_all('td'):
+ # Find the <a> tag within the <td>
+ a_tag = td.find('a')
+ if a_tag is not None:
+ url = a_tag['href']
+ text = a_tag.text.strip()
+ row_data.append((url, text))
+ else:
+ row_data.append((None, td.text.strip()))
+ rows.append(row_data)
else:
print("Tbody not found in the table.")
@@ -35,7 +44,12 @@ if response.status_code == 200:
if rows:
for row in rows:
- print("Row:", row)
+ for cell in row:
+ url, text = cell
+ if url:
+ print(f"URL: {url}, Text: {text}")
+ else:
+ print(f"Text: {text}")
else:
print("No rows found.")
else:
Output
Thead not found in the table.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-8-e5893752d706> in <cell line: 7>()
29 a_tag = td.find('a')
30 if a_tag is not None:
---> 31 url = a_tag['href']
32 text = a_tag.text.strip()
33 row_data.append((url, text))
/usr/local/lib/python3.10/dist-packages/bs4/element.py in __getitem__(self, key)
1571 """tag[key] returns the value of the 'key' attribute for the Tag,
1572 and throws an exception if it's not there."""
-> 1573 return self.attrs[key]
1574
1575 def __iter__(self):
KeyError: 'href'
⇦ question 6 | Index | question 8 ⇨ |