Question 8
Hm, I get `KeyError: 'href'`.
import requests
from bs4 import BeautifulSoup
# Fetch the catalog page and print every cell of the item table found inside
# the div with id '_idItemTableForP'. Cells that contain a link are printed
# with the link target; all other cells are printed as plain text.
page_url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'

# requests applies no timeout by default, so an unresponsive server would
# make the script wait forever; fail after 10 seconds instead.
response = requests.get(page_url, timeout=10)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')
        if table is not None:
            # Column titles, if the table has a <thead> section.
            headers = []
            thead = table.find('thead')
            if thead is not None:
                for th in thead.find_all('th'):
                    headers.append(th.text.strip())
            else:
                print("Thead not found in the table.")

            # Each row is a list of (url, text) tuples, one per <td>;
            # url is None for cells without a usable link.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Find the <a> tag within the <td>
                        a_tag = td.find('a')
                        if a_tag is not None:
                            # Some <a> tags carry no href attribute, so
                            # a_tag['href'] raised KeyError: 'href'.
                            url = a_tag.get('href', None) # Use get to avoid KeyError
                            text = a_tag.text.strip()
                            row_data.append((url, text))
                        else:
                            row_data.append((None, td.text.strip()))
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")

            if headers:
                print("Headers:", headers)
            else:
                print("No headers found.")

            if rows:
                for row in rows:
                    for cell in row:
                        url, text = cell
                        # A missing or empty href falls through to the
                        # text-only form.
                        if url:
                            print(f"URL: {url}, Text: {text}")
                        else:
                            print(f"Text: {text}")
            else:
                print("No rows found.")
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)
The diff:
# Find the <a> tag within the <td>
a_tag = td.find('a')
if a_tag is not None:
- url = a_tag['href']
+ url = a_tag.get('href', None) # Use get to avoid KeyError
text = a_tag.text.strip()
row_data.append((url, text))
else:
Output
Thead not found in the table.
No headers found.
Text:
Text:
URL: /v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D, Text: Brick 1 x 1
Text: New
Text: 15,730,337
Text: 6,663
Text: EUR 0.0001+
Text: Used
Text: 4,921,039
Text: 87,937
Text: EUR 0.0013+
Text:
Text:
URL: /v2/catalog/catalogitem.page?P=3004&name=Brick%201%20x%202&category=%5BBrick%5D, Text: Brick 1 x 2
Text: New
Text: 16,800,431
Text: 6,779
Text: EUR 0.0001+
Text: Used
Text: 7,076,722
Text: 101,667
Text: EUR 0.0009+
Text:
Text:
URL: /v2/catalog/catalogitem.page?P=3622&name=Brick%201%20x%203&category=%5BBrick%5D, Text: Brick 1 x 3
Text: New
Text: 3,867,766
Text: 5,893
Text: EUR 0.0001+
Text: Used
Text: 1,311,312
Text: 47,376
Text: EUR 0.001+
Text:
Text:
URL: /v2/catalog/catalogitem.page?P=3010&name=Brick%201%20x%204&category=%5BBrick%5D, Text: Brick 1 x 4
...
Nice! It’s the first time we get something that looks like the data we want.
⇦ question 7 | Index | question 9 ⇨ |