Jumpstart Your CDx Development: Build a Database of FDA-Approved CDx Devices in Just Five Minutes
If you're a CDx lead gearing up to develop a new companion diagnostic, it's crucial to start by exploring FDA-approved CDx devices. A comprehensive list is available on the FDA's official website: https://www.fda.gov/medical-devices/in-vitro-diagnostics/list-cleared-or-approved-companion-diagnostic-devices-in-vitro-and-imaging-tools. However, navigating this list is daunting: there is no search interface, so pinpointing the information you need is a time-intensive task.
In larger organizations, the usual route is to schedule a meeting with the data science support team, request a web crawl, and wait for a local database to be built. But how long does that take? Weeks? Months?
Here's a thought: why not do it yourself? The Python code below does the job in two steps: first it extracts the PMA detail-page URLs from the FDA list page, then it scrapes each detail page into a table.
from bs4 import BeautifulSoup

# Load the FDA list page, saved locally as an HTML file
with open('List of Cleared or Approved Companion Diagnostic Devices (In Vitro and Imaging Tools) _ FDA.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Collect the links to the PMA detail pages on accessdata.fda.gov
fda_urls = []
for link in soup.find_all('a', href=True):
    href = link['href']
    if href.startswith("https://www.accessdata.fda.gov/scripts/cdrh"):
        fda_urls.append(href)

# Print the extracted URLs
for url in fda_urls:
    print(url)
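If you'd rather not save the page manually, you can try fetching the list page directly. This is a sketch under the assumption that fda.gov serves the page to scripted requests (it may not, which is why the saved-HTML approach above is the safe default); the link-filtering logic is identical:

import requests
from bs4 import BeautifulSoup

# Fetch the FDA list page directly instead of reading a saved copy
list_url = ("https://www.fda.gov/medical-devices/in-vitro-diagnostics/"
            "list-cleared-or-approved-companion-diagnostic-devices-in-vitro-and-imaging-tools")
response = requests.get(list_url, headers={'User-Agent': 'Mozilla/5.0'})
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
fda_urls = [a['href'] for a in soup.find_all('a', href=True)
            if a['href'].startswith("https://www.accessdata.fda.gov/scripts/cdrh")]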
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os

def scrape_fda_data(url):
    # Initialize a session and set a browser-like User-Agent header
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Send a GET request to the PMA detail page
    response = session.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return None

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Variables to extract (excluding "Approval Order Statement" for now)
    variables = [
        "Device",
        "Generic Name",
        "Applicant",
        "PMA Number",
        "Date Received",
        "Decision Date",
        "Product Code",
        "Docket Number",
        "Notice Date",
        "Advisory Committee",
        "Clinical Trials",
        "Expedited Review Granted?",
        "Combination Product",
    ]

    # Each field sits in a table row: the label in a <th>, the value in the adjacent <td>
    data = {}
    for variable in variables:
        row = soup.find('th', string=variable)
        cell = row.find_next_sibling('td') if row else None
        data[variable] = cell.get_text(strip=True) if cell else None

    # Special handling for "Approval Order Statement", whose label is a <span>
    label = soup.find('span', string='Approval Order Statement')
    if label and label.parent:
        statement = label.parent.get_text(strip=True)
        data['Approval Order Statement'] = statement.replace('Approval Order Statement', '', 1).strip()
    else:
        data['Approval Order Statement'] = None

    return data

# Loop through each URL, scrape the data, and collect the records
records = []
for url in fda_urls:
    print(url)
    time.sleep(30)  # 30-second pause between requests
    data = scrape_fda_data(url)
    if data is not None:
        records.append(data)

# Build the DataFrame in one shot (DataFrame.append was removed in pandas 2.0)
df_all = pd.DataFrame(records)
print(df_all)

download_directory = 'your folder'  # replace with your output folder
filename = "PMA_list.xlsx"
df_all.to_excel(os.path.join(download_directory, filename), index=False)
Please be aware that it's necessary to adhere to the FDA's policy regarding web crawling. To avoid being blocked by the FDA website, I have included the time.sleep(30) command in the code, which introduces a 30-second delay between requests.
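If you want to verify this programmatically, Python's standard urllib.robotparser can check the site's robots.txt before you crawl. A minimal sketch follows; the actual rules published on accessdata.fda.gov may change, so treat the output as advisory:

from urllib.robotparser import RobotFileParser

# Check robots.txt on accessdata.fda.gov before crawling (rules may change over time)
rp = RobotFileParser()
rp.set_url("https://www.accessdata.fda.gov/robots.txt")
rp.read()

allowed = [url for url in fda_urls if rp.can_fetch("*", url)]
print(f"{len(allowed)} of {len(fda_urls)} URLs are allowed by robots.txt")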
That's all it takes to retrieve information on all FDA-approved CDx devices. Simple, isn't it? And once the spreadsheet is on disk, you have the search interface the FDA page lacks.
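As an illustration, here is a quick local query of the resulting file; the column names follow the variables scraped above, and "PD-L1" is just an example search term:

import pandas as pd

# Load the scraped database and search it locally
df = pd.read_excel("PMA_list.xlsx")

# Example: find all entries whose device name mentions "PD-L1"
hits = df[df["Device"].str.contains("PD-L1", case=False, na=False)]
print(hits[["Device", "Applicant", "PMA Number", "Decision Date"]])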