Question 13
Oh! I just noticed I want to process only every second row. How do I do this?
The code:
import requests
from bs4 import BeautifulSoup
import csv
import io
# Fetch the page, locate the item table, and print a CSV built from every
# second <tr> of the table body.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # The table of interest lives inside the div with this specific id.
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')
        if table is not None:
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                # Manual counter over the <tr> elements; only rows with an odd
                # index (the 2nd, 4th, 6th, ... row) are processed.
                row_index = 0
                for tr in tbody.find_all('tr'):
                    if row_index % 2 == 1:
                        row_data = []
                        for td in tr.find_all('td'):
                            # Find the <a> tag within the <td>
                            a_tag = td.find('a')
                            if a_tag is not None:
                                # A cell with a link contributes two values:
                                # the href, then the link text.
                                # NOTE: this rebinds the page-level `url`
                                # name, which is harmless here because the
                                # request has already been made.
                                url = a_tag.get('href', None)  # Use get to avoid KeyError
                                text = a_tag.text.strip()
                                row_data.append(url)
                                row_data.append(text)
                            else:
                                # A cell without a link contributes its text.
                                row_data.append(td.text.strip())
                        # Keep only the 4th and 5th collected values of the row.
                        rows.append(row_data[3:5])
                    # Advance the counter for every <tr>, processed or not.
                    row_index += 1
            else:
                print("Tbody not found in the table.")

            # Serialise the collected rows as CSV into an in-memory buffer and
            # print the result (runs even when no tbody was found, in which
            # case `rows` is empty).
            output = io.StringIO()
            csv_writer = csv.writer(output)
            for row in rows:
                csv_writer.writerow(row)
            csv_content = output.getvalue()
            print(csv_content)
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)
The diff:
if response.status_code == 200:
             rows = []
             tbody = table.find('tbody')
             if tbody is not None:
+                row_index = 0
                 for tr in tbody.find_all('tr'):
-                    row_data = []
-                    for td in tr.find_all('td'):
-                        # Find the <a> tag within the <td>
-                        a_tag = td.find('a')
-                        if a_tag is not None:
-                            url = a_tag.get('href', None) # Use get to avoid KeyError
-                            text = a_tag.text.strip()
-                            row_data.append(url)
-                            row_data.append(text)
-                        else:
-                            row_data.append(td.text.strip())
-                    rows.append(row_data[3:5])
+                    if row_index % 2 == 1:
+                        row_data = []
+                        for td in tr.find_all('td'):
+                            # Find the <a> tag within the <td>
+                            a_tag = td.find('a')
+                            if a_tag is not None:
+                                url = a_tag.get('href', None) # Use get to avoid KeyError
+                                text = a_tag.text.strip()
+                                row_data.append(url)
+                                row_data.append(text)
+                            else:
+                                row_data.append(td.text.strip())
+                        rows.append(row_data[3:5])
+                    row_index += 1
             else:
                 print("Tbody not found in the table.")
Output
/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D,Brick 1 x 1
/v2/catalog/catalogitem.page?P=3004&name=Brick%201%20x%202&category=%5BBrick%5D,Brick 1 x 2
/v2/catalog/catalogitem.page?P=3622&name=Brick%201%20x%203&category=%5BBrick%5D,Brick 1 x 3
/v2/catalog/catalogitem.page?P=3010&name=Brick%201%20x%204&category=%5BBrick%5D,Brick 1 x 4
/v2/catalog/catalogitem.page?P=3009&name=Brick%201%20x%206&category=%5BBrick%5D,Brick 1 x 6
/v2/catalog/catalogitem.page?P=3008&name=Brick%201%20x%208&category=%5BBrick%5D,Brick 1 x 8
/v2/catalog/catalogitem.page?P=3003&name=Brick%202%20x%202&category=%5BBrick%5D,Brick 2 x 2
/v2/catalog/catalogitem.page?P=3002&name=Brick%202%20x%203&category=%5BBrick%5D,Brick 2 x 3
/v2/catalog/catalogitem.page?P=3001&name=Brick%202%20x%204&category=%5BBrick%5D,Brick 2 x 4
/v2/catalog/catalogitem.page?P=2456&name=Brick%202%20x%206&category=%5BBrick%5D,Brick 2 x 6
/v2/catalog/catalogitem.page?P=3007&name=Brick%202%20x%208&category=%5BBrick%5D,Brick 2 x 8
/v2/catalog/catalogitem.page?P=2356&name=Brick%204%20x%206&category=%5BBrick%5D,Brick 4 x 6
/v2/catalog/catalogitem.page?P=4201&name=Brick%208%20x%208&category=%5BBrick%5D,Brick 8 x 8
/v2/catalog/catalogitem.page?P=6111&name=Brick%201%20x%2010&category=%5BBrick%5D,Brick 1 x 10
/v2/catalog/catalogitem.page?P=6112&name=Brick%201%20x%2012&category=%5BBrick%5D,Brick 1 x 12
/v2/catalog/catalogitem.page?P=2465&name=Brick%201%20x%2016&category=%5BBrick%5D,Brick 1 x 16
...
Oh, that looks much better!
⇦ question 12 | Index | question 14 ⇨ |