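# Federal Regulations Word Count Analysis Tool
# A Streamlit app that analyzes the volume of federal regulations per agency
# using the Electronic Code of Federal Regulations (eCFR) API at ecfr.gov.
# Assumed dependencies (the original paste does not pin them):
#   pip install streamlit requests pandas numpy matplotlib beautifulsoup4 lxml plotly networkx
# Assumed entry point: streamlit run app.py (the filename is not given in the original)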
import streamlit as st
import requests
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import time
from datetime import datetime, timedelta
from collections import defaultdict
from bs4 import BeautifulSoup
import plotly.graph_objects as go
import plotly.express as px
import networkx as nx

# Define global variables with default values
BASE_URL = "https://www.ecfr.gov"
request_timeout = 240  # Default value
skip_problematic_titles = True  # Default value
throttle_delay = 0.2  # Default value
cache_results = True

# Set page config
st.set_page_config(
    page_title="Federal Regulations Word Count Analysis Tool",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Functions to interact with the eCFR API
def get_agencies(force_refresh=False):
    """Fetch all agencies from the API with caching support"""
    cache_key = "agencies_cache"
    # Check if we have cached data and caching is enabled
    if not force_refresh and cache_results and cache_key in st.session_state:
        return st.session_state[cache_key]
    # If no cache or forced refresh, fetch from API
    url = f"{BASE_URL}/api/admin/v1/agencies.json"
    try:
        response = requests.get(url, timeout=request_timeout)
        if response.status_code == 200:
            agencies = response.json().get("agencies", [])
            # Cache the result if caching is enabled
            if cache_results:
                st.session_state[cache_key] = agencies
            return agencies
        else:
            st.error(f"Error fetching agencies: {response.status_code}")
            return []
    except requests.exceptions.Timeout:
        st.error("Request timed out when fetching agencies. Try increasing the timeout.")
        return []
    except Exception as e:
        st.error(f"Error fetching agencies: {str(e)}")
        return []
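
# Typical usage (response shape inferred from the fields this script reads:
# each agency dict carries "name", "slug", "short_name", "children", and
# "cfr_references"):
#   agencies = get_agencies()
#   print(agencies[0]["name"], len(agencies[0].get("children", [])))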

def get_titles(force_refresh=False):
    """Fetch summary information about all titles with caching support"""
    cache_key = "titles_cache"
    # Check if we have cached data and caching is enabled
    if not force_refresh and cache_results and cache_key in st.session_state:
        return st.session_state[cache_key]
    # If no cache or forced refresh, fetch from API
    url = f"{BASE_URL}/api/versioner/v1/titles.json"
    try:
        response = requests.get(url, timeout=request_timeout)
        if response.status_code == 200:
            titles = response.json().get("titles", [])
            # Cache the result if caching is enabled
            if cache_results:
                st.session_state[cache_key] = titles
            return titles
        else:
            st.error(f"Error fetching titles: {response.status_code}")
            return []
    except requests.exceptions.Timeout:
        st.error("Request timed out when fetching titles. Try increasing the timeout.")
        return []
    except Exception as e:
        st.error(f"Error fetching titles: {str(e)}")
        return []

def get_title_content(title_number, date, max_retries=1, force_refresh=False):
    """Fetch XML content for a title on a specific date with retry capability and caching"""
    cache_key = f"title_content_{title_number}_{date}"
    # Check if we have cached data and caching is enabled
    if not force_refresh and cache_results and cache_key in st.session_state:
        return st.session_state[cache_key]
    # Skip known problematic titles if option is enabled
    if skip_problematic_titles and title_number in [7, 10, 40, 42, 45]:
        st.warning(f"Skipping Title {title_number} as it's known to cause timeouts due to its size")
        return None
    url = f"{BASE_URL}/api/versioner/v1/full/{date}/title-{title_number}.xml"
    retries = 0
    while retries <= max_retries:
        try:
            response = requests.get(url, timeout=request_timeout)
            if response.status_code == 200:
                content = response.text
                # Cache the result if caching is enabled
                if cache_results:
                    st.session_state[cache_key] = content
                return content
            elif response.status_code == 504:
                retries += 1
                if retries <= max_retries:
                    st.warning(f"Gateway timeout (504) for Title {title_number}. Retrying ({retries}/{max_retries})...")
                    time.sleep(throttle_delay * 2)  # Wait longer between retries
                else:
                    st.warning(f"Gateway timeout (504) for Title {title_number} after {max_retries} retries. Try increasing the timeout or skipping large titles.")
                    return None
            elif response.status_code == 404:
                st.warning(f"Title {title_number} not available for date {date}")
                return None
            else:
                st.error(f"Error fetching title {title_number} content: {response.status_code}")
                return None
        except requests.exceptions.Timeout:
            retries += 1
            if retries <= max_retries:
                st.warning(f"Request timed out for Title {title_number}. Retrying ({retries}/{max_retries})...")
                time.sleep(throttle_delay * 2)
            else:
                st.warning(f"Request timed out for Title {title_number} after {max_retries} retries.")
                return None
        except Exception as e:
            st.error(f"Error fetching title {title_number} content: {str(e)}")
            return None

def count_words_in_xml(xml_content):
    """Count words in XML content using BeautifulSoup"""
    if not xml_content:
        return 0
    try:
        # Parse XML with BeautifulSoup
        soup = BeautifulSoup(xml_content, 'lxml-xml')
        # Extract all text
        text = soup.get_text()
        # Count words
        words = re.findall(r'\b\w+\b', text)
        return len(words)
    except Exception as e:
        st.warning(f"Error parsing XML: {str(e)}")
        # Fall back to simple regex approach
        text_without_tags = re.sub(r'<[^>]+>', ' ', xml_content)
        words = re.findall(r'\b\w+\b', text_without_tags)
        return len(words)
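
# Quick sanity check with a hypothetical fragment (real titles come from the API):
#   count_words_in_xml("<DIV1><P>General provisions apply here.</P></DIV1>")  # -> 4
# End-to-end sketch (hypothetical title/date; requires network access):
#   xml = get_title_content(1, "2025-01-01")
#   if xml:
#       print(f"Title 1: {count_words_in_xml(xml):,} words")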

def create_agency_title_mapping(agencies):
    """Create a mapping of agencies to the titles they regulate"""
    agency_to_titles = defaultdict(set)
    for agency in agencies:
        agency_name = agency["name"]
        # Add titles directly regulated by this agency
        if "cfr_references" in agency:
            for ref in agency["cfr_references"]:
                if "title" in ref:
                    agency_to_titles[agency_name].add(ref["title"])
        # Process child agencies
        if "children" in agency and agency["children"]:
            for child in agency["children"]:
                child_name = child["name"]
                if "cfr_references" in child:
                    for ref in child["cfr_references"]:
                        if "title" in ref:
                            agency_to_titles[child_name].add(ref["title"])
    return agency_to_titles
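
# Example with toy data (shape mirrors the eCFR agencies payload used above):
#   agencies = [{"name": "Department of Example",
#                "cfr_references": [{"title": 1}],
#                "children": [{"name": "Example Bureau",
#                              "cfr_references": [{"title": 2}]}]}]
#   create_agency_title_mapping(agencies)
#   # -> {"Department of Example": {1}, "Example Bureau": {2}}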

def extract_agency_hierarchy(agencies):
    """Extract the hierarchy of agencies for visualization"""
    nodes = []
    edges = []

    # Function to process an agency and its children
    def process_agency(agency, parent=None):
        agency_id = agency["slug"]
        agency_name = agency["name"]
        # Add node
        nodes.append({
            "id": agency_id,
            "name": agency_name,
            "short_name": agency.get("short_name", ""),
            "is_parent": len(agency.get("children", [])) > 0
        })
        # Add edge if there is a parent
        if parent:
            edges.append((parent, agency_id))
        # Process children
        for child in agency.get("children", []):
            process_agency(child, agency_id)

    # Process all top-level agencies
    for agency in agencies:
        process_agency(agency)
    return nodes, edges
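
# Returns two parallel structures: `nodes` is a list of dicts keyed by agency
# slug ("id", "name", "short_name", "is_parent"), and `edges` is a list of
# (parent_slug, child_slug) tuples ready to feed into a directed graph.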

def create_agency_hierarchy_graph(agencies):
    """Create a NetworkX graph for agency hierarchy"""
    nodes, edges = extract_agency_hierarchy(agencies)
    G = nx.DiGraph()
    # Add nodes with attributes
    for node in nodes:
        G.add_node(node["id"],
                   name=node["name"],
                   short_name=node["short_name"],
                   is_parent=node["is_parent"])
    # Add edges
    G.add_edges_from(edges)
    return G, nodes
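
# Example usage (assumes `agencies` came from get_agencies()):
#   G, nodes = create_agency_hierarchy_graph(agencies)
#   roots = [n for n, deg in G.in_degree() if deg == 0]  # top-level agencies
#   print(f"{G.number_of_nodes()} agencies, {len(roots)} top-level")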

def calculate_word_counts_over_time(agencies, titles_info, years, max_titles, throttle_delay):
    """Calculate word counts for each agency over multiple years"""
    # Get agency to title mapping
    agency_to_titles = create_agency_title_mapping(agencies)
    # Create reverse mapping (title -> agencies)
    title_to_agencies = defaultdict(list)
    for agency, titles in agency_to_titles.items():
        for title in titles:
            title_to_agencies[title].append(agency)
    # Dictionary to store word counts by year and agency
    word_counts_by_year = {}
    # Process titles for each year
    for year in years:
        target_date = f"{year}-01-01"
        st.write(f"Processing year: {year}")
        progress_bar = st.progress(0)
        # Dictionary to store word counts by title for this year
        title_word_counts = {}
        # Process titles (limited to max_titles)
        process_size = min(max_titles, len(titles_info))
        for i, title_info in enumerate(titles_info[:process_size]):
            title_number = title_info["number"]
            title_name = title_info.get("name", "Unknown")
            latest_date = title_info["latest_amended_on"]
            # Update progress
            progress = (i + 1) / process_size
            progress_bar.progress(progress)
            # Skip reserved titles
            if title_info.get("reserved", False):
                continue
            # Determine date to use
            use_date = target_date
            if datetime.strptime(latest_date, "%Y-%m-%d") < datetime.strptime(target_date, "%Y-%m-%d"):
                use_date = latest_date
            # Fetch and process content
            title_content = get_title_content(title_number, use_date)
            if title_content:
                word_count = count_words_in_xml(title_content)
                title_word_counts[title_number] = word_count
            # Add delay to avoid rate limiting
            time.sleep(throttle_delay)
        # Calculate word counts per agency for this year
        agency_word_counts = defaultdict(int)
        for title, word_count in title_word_counts.items():
            agencies_for_title = title_to_agencies.get(title, [])
            if agencies_for_title:
                # If a title is regulated by multiple agencies, distribute the word count
                count_per_agency = word_count / len(agencies_for_title)
                for agency in agencies_for_title:
                    agency_word_counts[agency] += count_per_agency
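        # e.g. a 30,000-word title shared by three agencies credits each with 10,000 words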
        # Store results for this year
        word_counts_by_year[year] = dict(agency_word_counts)
        progress_bar.empty()
    return word_counts_by_year
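
# The result maps year -> {agency name: estimated word count}; hypothetical call:
#   calculate_word_counts_over_time(agencies, titles_info, [2023, 2024], 5, 0.2)
#   # -> {2023: {"Department of Example": 12500.0, ...}, 2024: {...}}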

def main():
    global request_timeout, skip_problematic_titles, throttle_delay, cache_results
    st.title("Code of Federal Regulations Analyzer")
    st.write("""
This app analyzes the word count of federal regulations per agency using the Electronic Code of Federal Regulations (eCFR) API.
""")
    # Sidebar configuration
    st.sidebar.header("Analysis Configuration")
    # Date input with default of 2025-01-01
    default_date = datetime(2025, 1, 1)
    target_date = st.sidebar.date_input("Select a Date Cutoff", value=default_date)
    target_date_str = target_date.strftime("%Y-%m-%d")
    # Sample size selector
    max_titles = st.sidebar.slider("Number of titles to process", 1, 50, 5,
                                   help="Higher values provide more complete results but take longer to process")
    # Advanced options
    with st.sidebar.expander("Advanced Options"):
        cache_results = st.checkbox("Cache results", value=True,
                                    help="Store results to avoid reprocessing if parameters don't change")
        throttle_delay = st.slider("API request delay (seconds)", 0.1, 2.0, 0.2, 0.1,
                                   help="Increase to avoid API rate limiting")
        request_timeout = st.slider("Request timeout (seconds)", 30, 600, 240, 30,
                                    help="Maximum time to wait for API response before timing out (helps with 504 errors)")
        skip_problematic_titles = st.checkbox("Skip known large titles", value=True,
                                              help="Skip titles known to cause timeouts (7, 10, 40, 42, 45)")
    # Analytics tabs
    tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["Agency Analysis", "Title Analysis", "Agency Hierarchy",
                                                  "Word Count Over Time", "Regulatory Composition", "About"])
    with tab1:
        st.header("Regulation Word Count by Agency")
        if st.button("Calculate Word Counts", type="primary"):
            # Create a cache key based on the parameters
            cache_key = f"agency_analysis_{max_titles}_{target_date_str}_{skip_problematic_titles}_{request_timeout}_{throttle_delay}"
            # Check if we have cached results and caching is enabled
            if cache_results and cache_key in st.session_state:
                st.success("Using cached results. Toggle 'Cache results' off to force recalculation.")
                # Retrieve cached data
                agencies = st.session_state[f"{cache_key}_agencies"]
                titles_info = st.session_state[f"{cache_key}_titles_info"]
                title_word_counts = st.session_state[f"{cache_key}_title_word_counts"]
                df = st.session_state[f"{cache_key}_all_agencies_df"]
                senior_df = st.session_state[f"{cache_key}_senior_agencies_df"]
                title_df = st.session_state[f"{cache_key}_title_df"]
                agency_word_counts = st.session_state[f"{cache_key}_agency_word_counts"]
                # Store in session state for other tabs
                st.session_state.agencies = agencies
                st.session_state.titles_info = titles_info
                st.session_state.title_df = title_df
                st.session_state.title_word_counts = title_word_counts
                st.session_state.agency_word_counts = agency_word_counts
                st.session_state.all_agencies_df = df
                st.session_state.senior_agencies_df = senior_df
            else:
                # Initialize progress tracking
                progress_bar = st.progress(0)
                status_text = st.empty()
                results_placeholder = st.empty()
                try:
                    status_text.text("Fetching agencies and titles...")
                    # Force refresh if cache disabled
                    agencies = get_agencies(force_refresh=not cache_results)
                    titles_info = get_titles(force_refresh=not cache_results)
                    # Store in session state for other tabs
                    st.session_state.agencies = agencies
                    st.session_state.titles_info = titles_info
                    # Create agency-title mappings
                    agency_to_titles = create_agency_title_mapping(agencies)
                    title_to_agencies = defaultdict(list)
                    for agency, titles in agency_to_titles.items():
                        for title in titles:
                            title_to_agencies[title].append(agency)
                    # Identify most senior agencies (those with no parent)
                    all_child_names = set()
                    for agency in agencies:
                        for child in agency.get("children", []):
                            all_child_names.add(child["name"])
                    # Process titles
                    title_word_counts = {}
                    process_size = min(max_titles, len(titles_info))
                    col1, col2 = st.columns(2)
                    with col1:
                        st.subheader("Processing Status")
                        processing_status = st.empty()
                    with col2:
                        st.subheader("Current Statistics")
                        stats_display = st.empty()
                    for i, title_info in enumerate(titles_info[:process_size]):
                        title_number = title_info["number"]
                        title_name = title_info.get("name", "Unknown")
                        latest_date = title_info["latest_amended_on"]
                        # Update progress
                        progress = (i + 1) / process_size
                        progress_bar.progress(progress)
                        processing_status.write(f"Processing title {i+1} of {process_size}: Title {title_number} - {title_name}")
                        # Skip reserved titles
                        if title_info.get("reserved", False):
                            processing_status.write(f"Skipping reserved title {title_number}")
                            continue
                        # Determine date to use
                        use_date = target_date_str
                        if datetime.strptime(latest_date, "%Y-%m-%d") < datetime.strptime(target_date_str, "%Y-%m-%d"):
                            use_date = latest_date
                            processing_status.write(f"Using latest available date ({latest_date}) for Title {title_number}")
                        # Fetch and process content with cache control
                        title_content = get_title_content(title_number, use_date, force_refresh=not cache_results)
                        if title_content:
                            word_count = count_words_in_xml(title_content)
                            title_word_counts[title_number] = word_count
                            # Update stats
                            stats_display.write(f"""
**Title {title_number}**
- Name: {title_name}
- Word count: {word_count:,}
- Date analyzed: {use_date}
""")
                        # Add delay to avoid rate limiting
                        time.sleep(throttle_delay)
                    # Calculate word counts per agency
                    agency_word_counts = defaultdict(int)
                    for title, word_count in title_word_counts.items():
                        agencies_for_title = title_to_agencies.get(title, [])
                        if agencies_for_title:
                            # If a title is regulated by multiple agencies, distribute the word count
                            count_per_agency = word_count / len(agencies_for_title)
                            for agency in agencies_for_title:
                                agency_word_counts[agency] += count_per_agency
                    # Create DataFrame with all agencies
                    df = pd.DataFrame(list(agency_word_counts.items()), columns=["Agency", "Word Count"])
                    df["Word Count"] = df["Word Count"].astype(int)
                    df = df.sort_values(by="Word Count", ascending=False).reset_index(drop=True)
                    # Create a DataFrame with only top-level (senior) agencies
                    senior_agencies = [agency["name"] for agency in agencies]
                    senior_df = df[df["Agency"].isin(senior_agencies)]
                    senior_df = senior_df.sort_values(by="Word Count", ascending=False).reset_index(drop=True)
                    # Create title word count DataFrame
                    title_df = pd.DataFrame({
                        "Title": [f"Title {title}" for title in title_word_counts.keys()],
                        "Word Count": list(title_word_counts.values())
                    })
                    title_df = title_df.sort_values(by="Word Count", ascending=False).reset_index(drop=True)
                    # Store data in session state
                    st.session_state.agency_word_counts = agency_word_counts
                    st.session_state.all_agencies_df = df
                    st.session_state.senior_agencies_df = senior_df
                    st.session_state.title_df = title_df
                    st.session_state.title_word_counts = title_word_counts
                    # Cache results if enabled
                    if cache_results:
                        # Mark this parameter set as cached; the lookup above checks this key
                        st.session_state[cache_key] = True
                        st.session_state[f"{cache_key}_agencies"] = agencies
                        st.session_state[f"{cache_key}_titles_info"] = titles_info
                        st.session_state[f"{cache_key}_title_word_counts"] = title_word_counts
                        st.session_state[f"{cache_key}_all_agencies_df"] = df
                        st.session_state[f"{cache_key}_senior_agencies_df"] = senior_df
                        st.session_state[f"{cache_key}_title_df"] = title_df
                        st.session_state[f"{cache_key}_agency_word_counts"] = agency_word_counts
                    # Clear placeholders
                    progress_bar.empty()
                    status_text.empty()
                    processing_status.empty()
                    stats_display.empty()
                    # Display results
                    results_placeholder.write("### Results")
                    # Add tabs for different views
                    results_tab1, results_tab2 = st.tabs(["Senior Agencies", "All Agencies"])
                    with results_tab1:
                        st.write(f"Showing {len(senior_df)} Agencies")
                        st.dataframe(senior_df, use_container_width=True)
                        # Visualization for senior agencies only
                        st.subheader("Top 10 Agencies by Word Count")
                        fig, ax = plt.subplots(figsize=(12, 6))
                        top_df = senior_df.head(10) if len(senior_df) > 10 else senior_df
                        bars = ax.bar(top_df["Agency"], top_df["Word Count"])
                        # Calculate the maximum bar height for y-axis adjustment
                        max_height = top_df["Word Count"].max()
                        # Add value labels on top of each bar
                        for bar in bars:
                            height = bar.get_height()
                            ax.text(bar.get_x() + bar.get_width() / 2., height + max_height * 0.01,
                                    f'{height:,}', ha='center', va='bottom', rotation=0)
                        # Set y-axis limit with 15% padding to avoid clipping
                        ax.set_ylim(0, max_height * 1.15)
                        plt.xticks(rotation=45, ha="right")
                        plt.title(f"Top Agencies by Regulation Word Count (as of {target_date_str})")
                        plt.ylabel("Word Count")
                        plt.tight_layout()
                        st.pyplot(fig)
                    with results_tab2:
                        st.write(f"Showing all {len(df)} agencies (including child agencies)")
                        st.dataframe(df, use_container_width=True)
                        # Download button for results
                        csv = df.to_csv(index=False)
                        st.download_button(
                            label="Download complete agency data as CSV",
                            data=csv,
                            file_name=f"regulation_wordcount_by_agency_{target_date_str}.csv",
                            mime="text/csv"
                        )
                    # Store title data for the second tab
                    st.session_state.title_df = title_df
                    st.session_state.title_word_counts = title_word_counts
                except Exception as e:
                    st.error(f"An error occurred: {str(e)}")
                    st.write("This could be due to API rate limits or unavailability. Please try again later.")
    with tab2:
        st.header("Title-Level Analysis")
        if 'title_df' in st.session_state:
            st.subheader("Word Count by Title")
            st.dataframe(st.session_state.title_df, use_container_width=True)
            # Visualization for titles
            st.subheader("Top 10 Titles by Word Count")
            fig, ax = plt.subplots(figsize=(12, 6))
            top_title_df = st.session_state.title_df.head(10)
            bars = ax.bar(top_title_df["Title"], top_title_df["Word Count"])
            # Add value labels
            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width() / 2., height + 5,
                        f'{height:,}', ha='center', va='bottom', rotation=0)
            plt.xticks(rotation=45, ha="right")
            plt.title(f"Top Titles by Word Count (as of {target_date_str})")
            plt.ylabel("Word Count")
            plt.tight_layout()
            st.pyplot(fig)
            # Download button for title data
            csv = st.session_state.title_df.to_csv(index=False)
            st.download_button(
                label="Download title data as CSV",
                data=csv,
                file_name=f"regulation_wordcount_by_title_{target_date_str}.csv",
                mime="text/csv"
            )
        else:
            st.info("Please run the analysis in the 'Agency Analysis' tab first to view title-level data.")
    with tab3:
        st.header("Agency Hierarchy Visualization")
        # Check if agencies data is available or fetch it
        if 'agencies' not in st.session_state:
            with st.spinner("Fetching agency data..."):
                st.session_state.agencies = get_agencies()
        if st.session_state.agencies:
            # Get top-level agencies (most senior parents)
            top_level_agencies = list(st.session_state.agencies)
            # Find agencies with children (parents)
            parent_agencies = []

            # Function to find parent agencies recursively
            def find_parent_agencies(agency_list):
                for agency in agency_list:
                    # Check if this agency has children
                    if agency.get("children"):
                        parent_agencies.append(agency)
                        # Also check children recursively for any that might be parents
                        find_parent_agencies(agency["children"])

            # Process all agencies to find parent ones
            find_parent_agencies(top_level_agencies)
            # Find independent agencies (those not listed as children of any other agency)
            all_child_slugs = set()
            for agency