#!/usr/bin/env python "finds email addresses in database of PEGs" import MySQLdb, re hostname='stillwaterfs.cfaw.info' dbuser='cfaw' dbpass='xxxx' dbname='cfaw' dbtable='pevcf_raw' dbresult='pevcf_people' db=MySQLdb.connect(host=hostname, user=dbuser, passwd=dbpass, db=dbname,charset="utf8",use_unicode=True) cursor=db.cursor() cursor.execute("SELECT * FROM " + dbtable) # + " WHERE ID1 >= 0 AND ID1 < 100") names = [f[0] for f in cursor.description] #for row in cursor.fetchall(): # for pair in zip(names, row): # print '%s: %s' % pair people=names.index('People') id=names.index('ID1') website=names.index('Website') cu_result=db.cursor() cu_result.execute("DELETE FROM " + dbresult) insresult="INSERT INTO " + dbresult + " (ID1,email,name,greeting) VALUES (%s,%s,%s,%s);" #standard email re: \\b[A-Za-z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}\\b emailbasic='[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}' emailpattern=re.compile('\\b(' + emailbasic + ')\\b' + '">([^>]*)>', re.IGNORECASE | re.UNICODE) #firstname=re.compile('\\b(\w{3,})\\b', re.IGNORECASE | re.UNICODE) firstname=re.compile('\\b([^ ]{2,}) ', re.IGNORECASE | re.UNICODE) for row in cursor.fetchall(): if row[people] is not None: emailads=emailpattern.findall(row[people]) for emailad in emailads: first = firstname.findall(emailad[1]) if first[0] == 'Wm': greeting = 'William' #print '%s\t - %s' % (greeting, emailad[1]) elif first[0] == 'Dr': #in ('Mr',insresult'Mr.','Dr','Dr.','Ms','Ms.'): if len(first) <= 2: greeting = emailad[1].strip() #print '%s\t - %s' % (greeting, emailad[1]) else: greeting = first[1].strip() #print '%s\t - %s' % (greeting, emailad[1]) elif first[0] in ('De','de',"d'"): greeting = emailad[1].strip() #print '%s\t - %s' % (greeting, emailad[1]) elif len(first) == 1: greeting = 'Mr ' + first[0].strip() #print '%s\t - %s' % (greeting, emailad[1]) else: greeting = first[0].strip() #print '%s\t - %s' % (greeting, emailad[1]) #print '%s\t - %s' % (greeting, emailad[1]) #print '[%s]\t%s\t(%s)\t%s' % (row[id], emailad[0].strip(), emailad[1].strip(), greeting) #print insresult % (row[id], emailad[0].strip(), emailad[1].strip(), greeting) cu_result.execute(insresult,(row[id], emailad[0].strip(), emailad[1].strip(), greeting)) #print row[id], row[website],row[people] #print "found:", emailads db.commit() cu_result.close() cursor.close() db.close()