Free Python Script for Extracting Content Publish Dates

!pip install beautifulsoup4

import csv
import logging
from datetime import datetime

import requests
from bs4 import BeautifulSoup

def fetch_published_date(url, timeout=10):
    """
    Fetches the published date of an article from its meta properties.

    Downloads the page, looks for a <meta property="article:published_time">
    tag and returns the part of its 'content' value that precedes the first
    'T' (the date part of an ISO 8601 timestamp).

    Parameters:
    - url (str): The URL of the web page.
    - timeout (float or tuple): Seconds to wait for the server before giving
      up; passed unchanged to requests.get. Defaults to 10.

    Returns:
    - str: The published date in 'YYYY-MM-DD' format if available, else an empty string.
      An empty string is also returned when the request fails or times out,
      when the response status is not 200, or when the tag or its 'content'
      attribute is missing. This function never raises.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # requests has no default timeout: without one, a single
        # unresponsive server would block the caller indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            return ""

        soup = BeautifulSoup(response.text, 'html.parser')
        meta_tag = soup.find('meta', property='article:published_time')
        if meta_tag is None:
            return ""

        # A tag without a 'content' attribute yields "", same as "not found".
        published_time = meta_tag.attrs.get('content', '')
        return published_time.split('T')[0]  # Extracts date part
    except Exception as exc:
        # Deliberately best-effort: callers process many URLs and rely on an
        # empty string for "no date", so a failure on one URL must not
        # propagate. The failure is logged instead of being dropped silently.
        logging.getLogger(__name__).warning(
            "Could not fetch published date for %s: %s", url, exc)
        return ""

# Input: a TSV file whose first column holds one URL per row.
urls_file = 'urls.tsv'
results = []

# newline='' is what the csv module expects of files handed to it, and
# 'utf-8-sig' reads plain UTF-8 while dropping a leading byte-order mark
# that would otherwise become part of the first URL.
with open(urls_file, 'r', newline='', encoding='utf-8-sig') as file:
    # The delimiter must be a single tab character; csv rejects any
    # delimiter that is not exactly one character long.
    tsv_reader = csv.reader(file, delimiter='\t')
    for row in tsv_reader:
        # csv.reader yields an empty list for a blank line; skip those (and
        # rows whose first cell is only whitespace) instead of indexing
        # into them.
        if not row or not row[0].strip():
            continue
        url = row[0].strip()
        published_date = fetch_published_date(url)
        results.append((url, published_date))

# Save the results to a new TSV file
output_file = 'published_dates.tsv'

with open(output_file, 'w', newline='', encoding='utf-8') as file:
    tsv_writer = csv.writer(file, delimiter='\t')
    tsv_writer.writerow(['URL', 'Published Date'])  # Header
    # Each result is a (url, published_date) pair, i.e. one output row.
    tsv_writer.writerows(results)