Vangelis Katsikaros

Question 9

That’s better. Is it possible to print the output in comma separated format?

import requests
from bs4 import BeautifulSoup
import csv
import io

# Fetch the page, locate the table inside <div id="_idItemTableForP">, and
# print its header and body rows as CSV text on stdout.
url = 'https://vkatsikaros.github.io/dataharvest24-www.github.io/'
response = requests.get(url)

# Only parse the body on HTTP 200; any other status is reported at the bottom.
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    div = soup.find('div', id='_idItemTableForP')
    if div is not None:
        table = div.find('table')

        if table is not None:
            # Column names come from the <th> cells of <thead>. When the table
            # has no <thead>, a message is printed and headers stays empty, so
            # no header row is written further down.
            headers = []
            thead = table.find('thead')
            if thead is not None:
                for th in thead.find_all('th'):
                    headers.append(th.text.strip())
            else:
                print("Thead not found in the table.")

            # One list of cell strings per <tr> found in <tbody>.
            rows = []
            tbody = table.find('tbody')
            if tbody is not None:
                for tr in tbody.find_all('tr'):
                    row_data = []
                    for td in tr.find_all('td'):
                        # Find the <a> tag within the <td>
                        a_tag = td.find('a')
                        if a_tag is not None:
                            # NOTE: this rebinds the page `url` defined at the top
                            # of the script; the request has already been made by
                            # this point. A missing href yields None, which the
                            # f-string below renders as the text "None".
                            url = a_tag.get('href', None)  # Use get to avoid KeyError
                            text = a_tag.text.strip()
                            # NOTE: the link and its text are merged into ONE CSV
                            # field, and the value is already wrapped in double
                            # quotes here. csv.writer quotes the field again and
                            # doubles the embedded quotes, which is what produces
                            # the `"""` runs in the printed output.
                            row_data.append(f'"{url}", "{text}"')
                        else:
                            # Same manual quoting as above, so plain-text cells
                            # are double-quoted twice as well.
                            row_data.append(f'"{td.text.strip()}"')
                    rows.append(row_data)
            else:
                print("Tbody not found in the table.")
    
            # Build the CSV in an in-memory text buffer so it can be printed
            # as a single string.
            output = io.StringIO()
            csv_writer = csv.writer(output)
            
            # Skip the header row entirely when no <th> cells were collected.
            if headers:
                csv_writer.writerow(headers)
                
            if rows:
                for row in rows:
                    csv_writer.writerow(row)

            csv_content = output.getvalue()
            print(csv_content)
        else:
            print("Table not found within the div. Check the structure.")
    else:
        print("Div with id '_idItemTableForP' not found. Check the id.")
else:
    print('Failed to retrieve the webpage. Status code:', response.status_code)

The diff:

if response.status_code == 200:
                         if a_tag is not None:
                             url = a_tag.get('href', None)  # Use get to avoid KeyError
                             text = a_tag.text.strip()
-                            row_data.append((url, text))
+                            row_data.append(f'"{url}", "{text}"')
                         else:
-                            row_data.append((None, td.text.strip()))
+                            row_data.append(f'"{td.text.strip()}"')
                     rows.append(row_data)
             else:
                 print("Tbody not found in the table.")
     
+            output = io.StringIO()
+            csv_writer = csv.writer(output)
+            
             if headers:
-                print("Headers:", headers)
-            else:
-                print("No headers found.")
+                csv_writer.writerow(headers)
                 
             if rows:
                 for row in rows:
-                    for cell in row:
-                        url, text = cell
-                        if url:
-                            print(f"URL: {url}, Text: {text}")
-                        else:
-                            print(f"Text: {text}")
-            else:
-                print("No rows found.")
+                    csv_writer.writerow(row)
+
+            csv_content = output.getvalue()
+            print(csv_content)
         else:
             print("Table not found within the div. Check the structure.")
     else:

Output

Thead not found in the table.

"""None"", """"","""""","""/v2/catalog/catalogitem.page?P=3005&name=Brick%201%20x%201&category=%5BBrick%5D"", ""Brick 1 x 1""","""New""","""15,730,337""","""6,663""","""EUR 0.0001+"""
"""Used""","""4,921,039""","""87,937""","""EUR 0.0013+"""
"""None"", """"","""""","""/v2/catalog/catalogitem.page?P=3004&name=Brick%201%20x%202&category=%5BBrick%5D"", ""Brick 1 x 2""","""New""","""16,800,431""","""6,779""","""EUR 0.0001+"""
"""Used""","""7,076,722""","""101,667""","""EUR 0.0009+"""
"""None"", """"","""""","""/v2/catalog/catalogitem.page?P=3622&name=Brick%201%20x%203&category=%5BBrick%5D"", ""Brick 1 x 3""","""New""","""3,867,766""","""5,893""","""EUR 0.0001+"""
"""Used""","""1,311,312""","""47,376""","""EUR 0.001+"""
"""None"", """"","""""","""/v2/catalog/catalogitem.page?P=3010&name=Brick%201%20x%204&category=%5BBrick%5D"", ""Brick 1 x 4""","""New""","""5,709,796""","""6,332""","""EUR 0.0001+"""
"""Used""","""1,851,694""","""68,725""","""EUR 0.0027+"""
"""None"", """"","""""","""/v2/catalog/catalogitem.page?P=3009&name=Brick%201%20x%206&category=%5BBrick%5D"", ""Brick 1 x 6""","""New""","""2,288,682""","""5,834""","""EUR 0.0001+"""
"""Used""","""829,247""","""45,917""","""EUR 0.0013+"""

Oh, we have data! But why do we also have so many double quotes, i.e. """?

⇦ question 8 Index question 10 ⇨