Question 14
That’s nice. In each row there is a second element that has a code in a `<span>` element. Can you extract it? Example row:
<td rowspan="2" width="300px" class="pspItemClick"><a href="/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D" class="pspItemNameLink">Brick 1 x 1</a><br><span class="pspItemCateAndNo"><span class="blcatList"><a class="_blcatLink" onclick="" href="//www.bricklink.com/catalogList.asp?catType=P&catString=5">Brick</a></span> : 3005</span><span class="pspPCC"></span></td>
import requests
from bs4 import BeautifulSoup
import csv
import io
# Purpose: download the page at `url`, locate the <table> inside the div with
# id '_idItemTableForP', and try to print one item code per table row.
# NOTE(review): the indentation of this listing has been lost, so the nesting
# of the if/else/for blocks below has to be restored before the script can run.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)
# Only parse the body when the request succeeded (HTTP 200).
if response.status_code == 200:
soup = BeautifulSoup(response.content, 'html.parser')
div = soup.find('div', id='_idItemTableForP')
if div is not None:
table = div.find('table')
if table is not None:
# One entry per <tr>; each entry is the list of values collected from its <td>s.
rows = []
tbody = table.find('tbody')
if tbody is not None:
for tr in tbody.find_all('tr'):
row_data = []
for td in tr.find_all('td'):
# Find the second <a> tag within the <td>
a_tags = td.find_all('a')
if len(a_tags) >= 2:
second_a_tag = a_tags[1]
# NOTE(review): in the example row, span.pspItemCateAndNo *encloses* the
# second <a> (via span.blcatList) rather than being inside it, so this
# search within the second <a> finds nothing for that row and
# code_span is None.
code_span = second_a_tag.find('span', class_='pspItemCateAndNo')
if code_span is not None:
# Keep only the text after the last ':' (span text such as
# 'Brick : 3005' would give '3005').
code = code_span.text.strip().split(':')[-1].strip()
row_data.append(code)
# NOTE(review): with the indentation gone it is unclear whether this else
# pairs with the code_span check or with the len(a_tags) check.
else:
row_data.append('')
# row_data is still empty here if no <td> in this row appended a value.
rows.append(row_data)
else:
print("Tbody not found in the table.")
for row in rows:
# row[0] raises IndexError for any row whose row_data is empty.
print("Code:", row[0])
else:
print("Table not found within the div. Check the structure.")
else:
print("Div with id '_idItemTableForP' not found. Check the id.")
else:
print('Failed to retrieve the webpage. Status code:', response.status_code)
The diff. For no reason, the modulo (`%`) related code is now removed. Also, for no reason, the CSV printing code is also removed:
if response.status_code == 200:
rows = []
tbody = table.find('tbody')
if tbody is not None:
- row_index = 0
+
for tr in tbody.find_all('tr'):
- if row_index % 2 == 0:
- row_data = []
- for td in tr.find_all('td'):
- # Find the <a> tag within the <td>
- a_tag = td.find('a')
- if a_tag is not None:
- url = a_tag.get('href', None) # Use get to avoid KeyError
- text = a_tag.text.strip()
- row_data.append(url)
- row_data.append(text)
- else:
- row_data.append(td.text.strip())
- rows.append(row_data[3:5])
- row_index += 1
+ row_data = []
+ for td in tr.find_all('td'):
+ # Find the second <a> tag within the <td>
+ a_tags = td.find_all('a')
+ if len(a_tags) >= 2:
+ second_a_tag = a_tags[1]
+ code_span = second_a_tag.find('span', class_='pspItemCateAndNo')
+ if code_span is not None:
+ code = code_span.text.strip().split(':')[-1].strip()
+ row_data.append(code)
+ else:
+ row_data.append('')
+ rows.append(row_data)
else:
print("Tbody not found in the table.")
- output = io.StringIO()
- csv_writer = csv.writer(output)
-
for row in rows:
- csv_writer.writerow(row)
-
- csv_content = output.getvalue()
- print(csv_content)
+ print("Code:", row[0])
else:
print("Table not found within the div. Check the structure.")
Output
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-15-a16af276881b> in <cell line: 13>()
49 # Print the extracted codes
50 for row in rows:
---> 51 print("Code:", row[0])
52 else:
53 print("Table not found within the div. Check the structure.")
IndexError: list index out of range
...
⇦ question 13 | Index | question 15 ⇨ |