Free Python Script for Extracting Content Publish Dates
!pip install beautifulsoup4
import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime
def fetch_published_date(url, timeout=10):
    """
    Fetches the published date of an article from its meta properties.

    Downloads the page, looks for a
    <meta property="article:published_time" content="..."> tag and returns
    the part of its content value that precedes the first 'T' (the date part
    of an ISO 8601 timestamp such as '2023-05-01T08:30:00Z').

    Parameters:
    - url (str): The URL of the web page.
    - timeout (float): Seconds to wait for the server before giving up.
      Passed straight to requests.get; without it a stalled server would
      block the caller forever.

    Returns:
    - str: The published date in 'YYYY-MM-DD' format if available, else an
      empty string (non-200 response, missing meta tag, or any error).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return ""

        soup = BeautifulSoup(response.text, 'html.parser')
        meta_tag = soup.find('meta', property='article:published_time')
        if not meta_tag or 'content' not in meta_tag.attrs:
            return ""

        published_time = meta_tag.attrs['content']
        # Keep only the date part of the timestamp.
        return published_time.split('T')[0]
    except Exception:
        # Deliberate best-effort: this function is called once per URL in a
        # batch, so a single bad URL, timeout or unparsable page must yield
        # an empty result instead of aborting the whole run.
        return ""
# Input: 'urls.tsv', a tab-separated file with one URL per line in the
# first column. Any further columns are ignored.
urls_file = 'urls.tsv'
results = []
# newline='' lets the csv module handle line endings itself.
with open(urls_file, 'r', newline='', encoding='utf-8') as file:
    # The delimiter must be a single TAB character; the two-character
    # string backslash + 't' is rejected by the csv module with a TypeError.
    tsv_reader = csv.reader(file, delimiter='\t')
    for row in tsv_reader:
        # Blank lines produce an empty row; skip them instead of failing
        # on row[0].
        if not row or not row[0].strip():
            continue
        url = row[0].strip()
        published_date = fetch_published_date(url)
        results.append((url, published_date))

# Save the results to a new TSV file
output_file = 'published_dates.tsv'
with open(output_file, 'w', newline='', encoding='utf-8') as file:
    tsv_writer = csv.writer(file, delimiter='\t')
    tsv_writer.writerow(['URL', 'Published Date'])  # Header
    tsv_writer.writerows(results)