A Python & SQL Email Parser

Boost your productivity by parsing and extracting data from Outlook emails

This source code is the complete code from a Medium article that I wrote on how to parse Outlook emails. The sample email that I used in the article can be found on Github.

Requirements:

  • comtypes==1.1.7
  • pywin32==300
  • Note1: pip install wheel before installatation if you face any errors
  • Note2: This code should work on Python 3.8.5 and above
In [1]:
import sqlite3
from tkinter.filedialog import askdirectory
import os, re, html
import win32com.client
def main(): # Set up database db = setup() # Create an folder input dialog with tkinter folder_path = os.path.normpath(askdirectory(title='Select Folder')) # Initialise & populate list of emails email_list = [file for file in os.listdir(folder_path) if file.endswith(".msg")] # Connect to Outlook with MAPI outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI") # Iterate through every email for i, _ in enumerate(email_list): # Create variable storing info from current email being parsed msg = outlook.OpenSharedItem(os.path.join(folder_path, email_list[i])) # Search email HTML for body text regex = re.search(r"<body([\s\S]*)</body>", msg.HTMLBody) body = regex.group() # Search email body text for unique entries pattern = r"li class=\"MsoListParagraph\"([\s\S]*?)</li>" results = re.findall(pattern, body) for header in results: regex = re.search(r"[^<>]+(?=\(|sans-serif'>([\s\S]*?)</span>)", header) # HTML unescape to get remove remaining HTML title_pub = html.unescape(regex.group()) # Split data split_list = title_pub.split("–") title = split_list[0].strip() publication = split_list[1].strip() # List of publications to check for platform_list = ["Online", "Facebook", "Instagram", "Twitter", "LinkedIn", "Linkedin", "Youtube"] # Create variable with list of publications platforms = [p for p in platform_list if p in header] # Find all links using regex links = re.findall(r"<a href=\"([\s\S]*?)\">", header) sql_insert(db, title, publication, platforms, links) def sql_insert(db, title, publication, platforms, links): # Insert title & pub by substituting values into each ? placeholder db.execute("INSERT INTO articles (title, publication) VALUES (?, ?)", (title, publication)) # Get article id and copy to platforms & links tables article_id = db.execute("SELECT id FROM articles WHERE title = ?", (title,)) # Copy id from main table into platforms and links table db.execute("INSERT INTO platforms (article_id) SELECT id FROM articles WHERE title = ?", (title,)) db.execute("INSERT INTO links (article_id) SELECT id FROM articles WHERE title = ?", (title,)) for item in article_id: _id = item[0] for i, _ in enumerate(platforms): db.execute(f"UPDATE platforms SET platform{i} = ? WHERE article_id = ?", (platforms[i], _id)) for i, _ in enumerate(links): db.execute(f"UPDATE links SET link{i} = ? WHERE article_id = ?", (links[i], _id)) db.commit() def setup(): # Create & connect to database db = sqlite3.connect("emails.db") # Create empty tables db.execute(""" CREATE TABLE IF NOT EXISTS "articles" ( "id" INTEGER, "title" TEXT UNIQUE, "publication" TEXT, PRIMARY KEY("id" AUTOINCREMENT)) """) db.execute(""" CREATE TABLE IF NOT EXISTS "links" ( "article_id" INTEGER, "link0" TEXT, "link1" TEXT, "link2" TEXT, PRIMARY KEY("article_id")) """) db.execute(""" CREATE TABLE IF NOT EXISTS "platforms" ( "article_id" INTEGER, "platform0" TEXT, "platform1" TEXT, "platform2" TEXT, PRIMARY KEY("article_id")) """) # Reset databases after each run db.execute("DELETE FROM articles") db.execute("DELETE FROM platforms") db.execute("DELETE FROM links") db.execute("DELETE FROM sqlite_sequence") db.commit() return db main()