This is a continuation of my last post, “Leads Automation: Using LLM to estimate webpage freshness.”
In that post I used a function estimate_last_update_with_llm(url) to grab the contents of a webpage—or at least the first 5000 characters—and send them off to ChatGPT 4o to try to ascertain how old it thought the blog was.
It was supposed to use a lot of fancy tricks to get a good idea of when it was last updated, so that I can focus my efforts on actively maintained leads instead of long-dead ghost blogs.
Though it does work, it didn’t work as well as I would have liked, and I ran out of OpenAI API credits in the process.
So now I am going to try the same thing, but using pre-LLM technology, namely, regular expressions.
Now I already have a version that used the meta “modified” header, but that version gave misleading results. I am hoping to do better with regular expressions that target the date of the last post.
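For context, that kind of check looks roughly like the sketch below (hypothetical, not my actual earlier code; it assumes the page exposes a meta tag such as article:modified_time and simply returns whatever that tag claims, which is part of why it can mislead):

# Minimal sketch of the meta-header approach (hypothetical; assumes a tag
# such as <meta property="article:modified_time" content="...">).
from bs4 import BeautifulSoup
import requests

def get_meta_modified(url):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    if response.status_code != 200:
        return None
    tag_soup = BeautifulSoup(response.text, 'html.parser')
    tag = (tag_soup.find('meta', attrs={'property': 'article:modified_time'})
           or tag_soup.find('meta', attrs={'name': 'last-modified'}))
    return tag.get('content') if tag else None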
Here is the new function to use:
from bs4 import BeautifulSoup
import re
import requests
from datetime import datetime
def estimate_last_updated_no_llm(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if response.status_code != 200:
            return None  # Return None if the page fails to load
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for <time> elements, which often contain blog post dates
        time_tags = soup.find_all('time')
        dates = []
        for tag in time_tags:
            # Prefer the machine-readable datetime attribute when present
            date_text = tag.get('datetime')
            if date_text:
                # datetime attributes are often full ISO timestamps; keep only the date part
                date_text = date_text.split('T', 1)[0]
            else:
                date_text = tag.text.strip()
            dates.append(date_text)

        # Extract potential dates from the page content
        date_patterns = [
            r'(\b\d{4}-\d{2}-\d{2}\b)',  # YYYY-MM-DD
            r'(\b\d{2}/\d{2}/\d{4}\b)',  # MM/DD/YYYY or DD/MM/YYYY
            r'(\b\d{4}/\d{2}/\d{2}\b)',  # YYYY/MM/DD
            r'(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}, \d{4}\b)',  # Month DD, YYYY
        ]
        text = soup.get_text()
        for pattern in date_patterns:
            matches = re.findall(pattern, text)
            if matches:
                dates.extend(matches)

        # Convert dates to datetime objects for sorting
        parsed_dates = []
        for date_str in set(dates):  # Use set to remove duplicates
            # "%b %d, %Y" handles abbreviated month names, e.g. "Jan 5, 2025"
            for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%Y/%m/%d", "%B %d, %Y", "%b %d, %Y"):
                try:
                    parsed_dates.append(datetime.strptime(date_str, fmt))
                    break  # Stop checking once successfully parsed
                except ValueError:
                    continue

        if parsed_dates:
            last_update = max(parsed_dates).date()  # Keep as a date object for Excel compatibility
            return last_update  # Return a `datetime.date` object for proper Excel sorting
        return None  # Return None if no valid date was found
    except Exception:
        return None  # Return None on error
This works well enough without the overhead and cost of an LLM. I’m going to go with it for now anyhow.
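Before moving on, here is a quick sanity check (the URL below is just a placeholder, and this assumes the function above is already defined):

# Smoke test for the regex-based estimator defined above.
# The URL is a placeholder, not a real lead.
last_update = estimate_last_updated_no_llm("https://example.com/blog")
print("Estimated last update:", last_update)  # a datetime.date, or None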
Now let’s also see if we can grab an email address from the blog. Ultimately, I will want to contact the blog owner, so if the system can intelligently grab a contact email, that might save me some time.
I can predict some problems right away, though. Am I going to use regular expressions to just grab any email address on the page? That may be worth doing, but I won’t be surprised if the quality is too low.
Ultimately, I would want an intelligent function that looks for something like a “Contact” page or section, and that can decode the emails people spell out to fool bots.
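The decoding part, at least, doesn’t need anything intelligent. Here is a minimal sketch of the idea, assuming common spelled-out patterns like “name [at] example [dot] com” (the deobfuscate_emails helper is my own illustration, not something used later in the script):

import re

def deobfuscate_emails(text):
    """Rewrite common spelled-out email patterns back into name@domain form.
    Only handles obvious obfuscations like '[at]' / '(dot)'."""
    # Replace bracketed or parenthesized 'at' and 'dot', with optional surrounding spaces
    text = re.sub(r'\s*[\[\(]\s*at\s*[\]\)]\s*', '@', text, flags=re.IGNORECASE)
    text = re.sub(r'\s*[\[\(]\s*dot\s*[\]\)]\s*', '.', text, flags=re.IGNORECASE)
    return text

# Example: "webmaster [at] example [dot] com" -> "webmaster@example.com"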
Of course, email might not even be the best way to contact the blogmaster. What if my function were smart enough to figure out how they should be contacted?
That might be a job for an LLM again. But I’m not doing that now.
Let’s just try this:
def extract_emails_from_webpage(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if response.status_code != 200:
            return None  # Page retrieval failed
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()  # Extract visible text from the webpage

        # Regular expression to match email addresses
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, text)
        return list(set(emails)) if emails else None  # Remove duplicates and return list
    except Exception:
        return None  # Return None on error
Hmmm… It didn’t grab the email from a test page that has one.
It took me a while to figure out that my test page with the email was returning a 406 “Not Acceptable” status code.
Eventually I got it working with a better Accept header:
def extract_emails_from_webpage(url):
    try:
        response = requests.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'
        })
        # print('response.status_code=' + str(response.status_code))
        # I was getting a 406 "Not Acceptable" status, so I had to add a better Accept header
        if response.status_code != 200:
            return None  # Page retrieval failed
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()  # Extract visible text from the webpage

        # Regular expression to match email addresses
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, text)
        return list(set(emails)) if emails else None  # Remove duplicates and return list
    except Exception:
        return None  # Return None on error
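With the extra headers in place, a quick check looks like this (placeholder URL again; it assumes the function above is defined):

# Smoke test for the email extractor above; the URL is a placeholder.
emails = extract_emails_from_webpage("https://example.com/contact")
print("Emails found:", emails)  # e.g. ['webmaster@example.com'] or None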
Here is the complete code.
extract_leads_with_dates_and_emails_no_llm.py:
from bs4 import BeautifulSoup
import re
import csv
import requests
import openai
from datetime import datetime
def extract_emails_from_webpage(url):
    try:
        response = requests.get(url, headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'
        })
        # print('response.status_code=' + str(response.status_code))
        # I was getting a 406 "Not Acceptable" status, so I had to add a better Accept header
        if response.status_code != 200:
            return None  # Page retrieval failed
        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()  # Extract visible text from the webpage

        # Regular expression to match email addresses
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, text)
        return list(set(emails)) if emails else None  # Remove duplicates and return list
    except Exception:
        return None  # Return None on error
def estimate_last_updated_no_llm(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        if response.status_code != 200:
            return None  # Return None if the page fails to load
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for <time> elements, which often contain blog post dates
        time_tags = soup.find_all('time')
        dates = []
        for tag in time_tags:
            # Prefer the machine-readable datetime attribute when present
            date_text = tag.get('datetime')
            if date_text:
                # datetime attributes are often full ISO timestamps; keep only the date part
                date_text = date_text.split('T', 1)[0]
            else:
                date_text = tag.text.strip()
            dates.append(date_text)

        # Extract potential dates from the page content
        date_patterns = [
            r'(\b\d{4}-\d{2}-\d{2}\b)',  # YYYY-MM-DD
            r'(\b\d{2}/\d{2}/\d{4}\b)',  # MM/DD/YYYY or DD/MM/YYYY
            r'(\b\d{4}/\d{2}/\d{2}\b)',  # YYYY/MM/DD
            r'(\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}, \d{4}\b)',  # Month DD, YYYY
        ]
        text = soup.get_text()
        for pattern in date_patterns:
            matches = re.findall(pattern, text)
            if matches:
                dates.extend(matches)

        # Convert dates to datetime objects for sorting
        parsed_dates = []
        for date_str in set(dates):  # Use set to remove duplicates
            # "%b %d, %Y" handles abbreviated month names, e.g. "Jan 5, 2025"
            for fmt in ("%Y-%m-%d", "%m/%d/%Y", "%Y/%m/%d", "%B %d, %Y", "%b %d, %Y"):
                try:
                    parsed_dates.append(datetime.strptime(date_str, fmt))
                    break  # Stop checking once successfully parsed
                except ValueError:
                    continue

        if parsed_dates:
            last_update = max(parsed_dates).date()  # Keep as a date object for Excel compatibility
            return last_update  # Return a `datetime.date` object for proper Excel sorting
        return None  # Return None if no valid date was found
    except Exception:
        return None  # Return None on error
def estimate_last_update_with_llm(url):
    """Estimate the last update date of a webpage using ChatGPT."""
    try:
        # Fetch the page content
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        page_content = response.text[:5000]  # Limit to first 5000 characters

        # Construct prompt for ChatGPT
        '''
        prompt = (
            "Based on the following webpage content, estimate the most recent update date if possible. "
            "Look for any visible timestamps, published or modified dates, or context that might indicate recency.\n\n"
            f"Content:\n{page_content}\n\nEstimated last update date:"
        )
        '''
        prompt = (
            "Based on the following URL, estimate the most recent update date if possible. "
            "Look for any visible timestamps, published or modified dates, or context that might indicate recency.\n\n"
            f"URL:\n{url}\n\nEstimated last update date:"
        )

        # Initialize OpenAI client
        client = openai.OpenAI()

        # Call OpenAI API
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an expert in analyzing webpage content."},
                # {"role": "user", "content": prompt}
                {"role": "user", "content": f"Based on this webpage, when was it last updated?\n\n{page_content}"}
            ]
        )

        # Extract the estimated date from response
        estimated_date = response.choices[0].message.content.strip()
        return estimated_date
    except requests.RequestException:
        return "Error fetching the webpage"
    except Exception as e:
        return f"LLM estimation failed: {str(e)}"
def get_last_modified(url):
    """Retrieve the Last-Modified header from a URL if available."""
    try:
        response = requests.head(url, timeout=5)
        return response.headers.get("Last-Modified", "Unknown")
    except requests.RequestException:
        return "Unknown"
def extract_urls_from_html(html_path):
    """Extract all URLs, their link text, and last modified dates from an HTML file."""
    with open(html_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    url_data = set()  # Use a set to avoid duplicates

    # Extract URLs from <a href="..."> and their link text
    count = 0
    for link in soup.find_all("a", href=True):
        url = link["href"].strip()
        link_text = link.get_text(strip=True)  # Extract visible link text
        last_modified = estimate_last_updated_no_llm(url)
        email = extract_emails_from_webpage(url)
        print("email =", str(email))
        url_data.add((url, link_text, last_modified, str(email)))  # Ensuring 4 values
        print(f"{count} : {url} : {link_text} : {last_modified}")
        count += 1

    # Extract URLs appearing as plain text
    url_pattern = re.compile(r"https?://[^\s\"'>]+")
    for text in soup.stripped_strings:
        for match in url_pattern.findall(text):
            last_modified = estimate_last_updated_no_llm(match)
            email = extract_emails_from_webpage(match)
            print("email=" + str(email))
            url_data.add((match, "", last_modified, str(email)))  # Empty link text; email kept so rows have 4 values

    return list(url_data)
def save_urls_to_csv(url_data, output_file):
    """Save extracted URLs, their link text, and last modified dates to a CSV file."""
    with open(output_file, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["URL", "Link Text", "Last Modified", "Email"])  # Header row
        writer.writerows(url_data)
    print(f"URLs saved to {output_file}")
if __name__ == "__main__":
    html_path = "test_leads.html"  # just do a small list for testing
    extracted_urls = extract_urls_from_html(html_path)
    if extracted_urls:
        output_file = "lead_urls.csv"
        print("Extracted URLs:")
        for url, text, last_modified, email in extracted_urls:
            print(f"URL: {url}, Link Text: {text}, Last Modified: {last_modified}, Email: {email}")
        save_urls_to_csv(extracted_urls, output_file)
    else:
        print("No URLs found in the HTML file.")