Parse XML file using selenium and bs4?

I am trying to parse an XML file using the following code:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Open a ProPublica organization page in Chrome, click its first "XML" link,
# switch to the second window handle, and look for a <PhoneNum> element in
# that window's page source.
# --- Chrome configuration ---
options = Options()
# options.add_argument('--headless=new')
options.add_argument("start-maximized")
options.add_argument('--log-level=3')
options.add_experimental_option("prefs", {"profile.default_content_setting_values.notifications": 1})
options.add_experimental_option("excludeSwitches", ["enable-automation"])
# NOTE(review): same "excludeSwitches" key as the call above — presumably this
# value replaces ["enable-automation"] rather than being merged; confirm.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument('--disable-blink-features=AutomationControlled')
# --- Start the browser ---
srv=Service()
driver = webdriver.Chrome (service=srv, options=options)
# driver.minimize_window()
# Explicit waits created from this object time out after 10 seconds.
waitWD = WebDriverWait (driver, 10)
wLink = "https://projects.propublica.org/nonprofits/organizations/830370609"
driver.get(wLink)
# Wait until the first <a> whose text is exactly "XML" is clickable, then
# click it through JavaScript instead of WebElement.click().
driver.execute_script("arguments[0].click();", waitWD.until(EC.element_to_be_clickable((By.XPATH, '(//a[text()="XML"])[1]'))))
# Assumes the click opened a second window/tab; index 1 raises IndexError
# if only one handle exists at this point.
driver.switch_to.window(driver.window_handles[1])
# Fixed pause before reading the URL and page source of the new window.
time.sleep(3)
print(driver.current_url)
# The browser's page source is parsed with the 'lxml' parser here; the run
# shown in this post prints None for the lookup below.
soup = BeautifulSoup (driver.page_source, 'lxml')
worker = soup.find("PhoneNum")
print(worker)
But as you can see in the result below, I am not able to parse, for example, the element "PhoneNum":
(selenium) C:\DEV\Fiverr2025\TRY\austibn>python test.py
https://pp-990-xml.s3.us-east-1.amazonaws.com/202403189349311780_public.xml?response-content-disposition=inline&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIA266MJEJYTM5WAG5Y%2F20250423%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250423T152903Z&X-Amz-Expires=1800&X-Amz-SignedHeaders=host&X-Amz-Signature=9743a63b41a906fac65c397a2bba7208938ca5b865f1e5a33c4f711769c815a4
None
How can I parse the XML file from this site?
Answer
Fixes:
- Use `requests.get()` to fetch the XML directly (faster and more reliable than Selenium for raw XML).
- Parse with `BeautifulSoup(..., 'xml')` (not `'lxml'`, which is for HTML). The HTML parser lowercases tag names, which is why `soup.find("PhoneNum")` returned `None`.
- Close Selenium after getting the URL (since it's no longer needed).
- Check if the tag exists before accessing `.text`.
`soup.find("PhoneNum")` returns only the first phone number, so I use `find_all()` to return all matching elements.
The full code with corrections:
import time
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Fetch the first "XML" filing linked from a ProPublica organization page and
# print every <PhoneNum> value it contains, then save the raw XML to disk.
#
# Selenium is used only to click the link and read the URL of the tab it
# opens; the XML itself is downloaded with requests and parsed with
# BeautifulSoup's XML parser (the 'xml' feature requires lxml to be installed).
options = Options()
options.add_argument("start-maximized")
options.add_argument('--log-level=3')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_argument('--disable-blink-features=AutomationControlled')

srv = Service()
driver = webdriver.Chrome(service=srv, options=options)
try:
    waitWD = WebDriverWait(driver, 10)
    url = "https://projects.propublica.org/nonprofits/organizations/830370609"
    driver.get(url)

    # Click through JavaScript so an overlay cannot intercept the click.
    xml_button = waitWD.until(
        EC.element_to_be_clickable((By.XPATH, '(//a[text()="XML"])[1]'))
    )
    driver.execute_script("arguments[0].click();", xml_button)

    # The link opens a new tab; wait for it to exist before indexing
    # window_handles, otherwise handles[1] can raise IndexError.
    waitWD.until(EC.number_of_windows_to_be(2))
    driver.switch_to.window(driver.window_handles[1])

    # Give the new tab time to finish navigating to the final XML URL
    # before reading it.
    time.sleep(3)
    xml_url = driver.current_url
finally:
    # Always close the browser, even if a wait above timed out.
    driver.quit()

# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(xml_url, timeout=30)
if response.status_code != 200:
    print("Failed to download XML")
    # Exit with a non-zero status; the bare exit() helper comes from the
    # site module and is not guaranteed to be available in every interpreter.
    raise SystemExit(1)

# 'xml' keeps tag names case-sensitive, so "PhoneNum" matches as written.
soup = BeautifulSoup(response.content, 'xml')
phone_numbers = soup.find_all('PhoneNum')
if phone_numbers:
    print(f"Found {len(phone_numbers)} phone numbers:")
    for idx, phone in enumerate(phone_numbers, start=1):
        print(f"{idx}. {phone.text.strip()}")
else:
    print("No <PhoneNum> tags found in the XML.")

# Save the downloaded bytes unchanged so the file keeps its original encoding.
with open("propublica_data.xml", "wb") as f:
    f.write(response.content)
print("XML saved to 'propublica_data.xml'")
Output:
Found 4 phone numbers:
1. 6023146022
2. 6022687502
3. 6028812483
4. 6023146022
XML saved to 'propublica_data.xml'
Enjoyed this article?
Check out more content on our blog or follow us on social media.
Browse more articles