#!/usr/bin/env python
"""processes a file to std out

Usage: python <this script> INPUT_FILE

Scans INPUT_FILE line by line with a small state machine and prints one
record per session: id, title, speaker, place, subject and theme.  Records
are written as tab-separated values and/or as HTML table rows, depending on
the flags below.

NOTE(review): this script was recovered from a copy in which every HTML tag
had been stripped and all line breaks removed.  Anything marked
"reconstructed" below is a best guess and must be compared with a pristine
copy before the output is relied upon.  The TSV path (the default
configuration) was fully visible and is unchanged.

Written so that it runs under both Python 2 and Python 3 (every print call
takes exactly one argument).
"""
# todo: get rid of html in fields

import re
import sys

TSV = 1           # emit one tab-separated line per record
HTML = 0          # emit the HTML page wrapper and one table row per record
SESSIONBREAK = 0  # print subject/theme as their own rows, not as columns
CHECKSEQ = 1      # report records whose id does not follow the running count
OFFSET = 500      # added to every session id before it is printed

# Patterns that survived intact (only made raw strings; same regexes).
sessionID = re.compile(r'>(\d\d\d) \d')
speaker = re.compile(r'>([^<\(]*)<')
speakerplace = re.compile(r'>([^<\(]*)\(([^<\)]*)\)<')
panel = re.compile(r'>((Panel|Moderator)[^<]*)<')

# NOTE(review): reconstructed.  Only the capture groups of these three
# patterns survived; the literal text around them (probably specific tags)
# was lost.  The '>' ... '<' delimiters follow the surviving patterns above.
subject = re.compile(r'>([A-Z][A-Z 0-9.,&/-]{5,})<')
theme = re.compile(r'THEME: ([^<]*)<')
title = re.compile(r'>(.*)<')

_TAG = re.compile(r'<[^>]*>')

# States of the line scanner.
_SEEK_ID = 1       # looking for a subject, a theme, or the next session id
_SEEK_SPEAKER = 2  # id seen; looking for panel / speaker line
_SEEK_TITLE = 3    # speaker seen; looking for the title line
_RECORD_READY = 4  # all fields collected; emit the record


def xtags(s):
    """Return *s* with markup removed and 'xxxxx' collapsed to 'x'.

    NOTE(review): the original removed four specific literal tags whose
    names were lost; this version strips every '<...>' tag instead.
    """
    return _TAG.sub('', s).replace('xxxxx', 'x')


def noval():
    """Return the placeholder used for an empty field.

    NOTE(review): the HTML placeholder may originally have been an entity
    such as a non-breaking space; only a blank survived.
    """
    if HTML:
        return ' '
    return ''


def _emit(text):
    """Default output sink: print one line to stdout."""
    print(text)


def process(lines, emit=None):
    """Run the session scanner over *lines* and pass output lines to *emit*.

    lines -- any iterable of text lines (an open file works).
    emit  -- callable taking one string; defaults to printing to stdout.

    Reads the module-level flags TSV, HTML, SESSIONBREAK, CHECKSEQ and
    OFFSET at call time.
    """
    if emit is None:
        emit = _emit

    # NOTE(review): reconstructed initial state.  Empty defaults also avoid
    # a NameError when a session id appears before any subject or theme.
    status = _SEEK_ID
    counter = 0
    idname = titlename = speakername = placename = ''
    subjectname = themename = ''

    for line in lines:
        if status == _SEEK_ID:
            sub = subject.search(line)
            id_match = sessionID.search(line)
            th = theme.search(line)
            if sub:
                subjectname = sub.group(1)
                themename = ''  # a new subject resets the theme
                if SESSIONBREAK:
                    # NOTE(review): markup reconstructed.
                    emit('<tr><td colspan="4">' + subjectname + '</td></tr>')
            elif th:
                themename = th.group(1)
                if SESSIONBREAK:
                    # NOTE(review): markup reconstructed.
                    emit('<tr><td colspan="4">Theme: ' + themename
                         + '</td></tr>')
            elif id_match:
                idname = id_match.group(1)
                status = _SEEK_SPEAKER
        elif status == _SEEK_SPEAKER:
            sp = speaker.search(line)
            sppl = speakerplace.search(line)
            pa = panel.search(line)
            if pa:
                # Panel/Moderator lines are the title; no speaker or place.
                titlename = pa.group(1)
                speakername = noval()
                placename = noval()
                status = _RECORD_READY
            elif sppl:
                speakername = sppl.group(1)
                placename = sppl.group(2)
                status = _SEEK_TITLE
            elif sp:
                speakername = sp.group(1)
                if speakername.startswith('General Discussion'):
                    # The line is a title, not a speaker.
                    titlename = speakername
                    speakername = noval()
                    status = _RECORD_READY
                else:
                    status = _SEEK_TITLE
                placename = noval()
        elif status == _SEEK_TITLE:
            ti = title.search(line)
            if ti:
                titlename = xtags(ti.group(1))
                status = _RECORD_READY

        if status == _RECORD_READY:
            counter += 1
            if CHECKSEQ and int(idname) != counter:
                notice = 'MISSED COUNT: ' + idname + '(' + str(counter) + ')'
                if HTML:
                    # NOTE(review): markup reconstructed.
                    notice = '<tr><td colspan="6">' + notice + '</td></tr>'
                emit(notice)
                counter = int(idname)  # resynchronise with the source ids
            if OFFSET != 0:
                idname = str(int(idname) + OFFSET)
            if HTML:
                # NOTE(review): markup reconstructed.
                row = ('<tr><td>' + idname + '</td><td>' + titlename
                       + '</td><td>' + speakername + '</td><td>' + placename
                       + '</td>')
                if not SESSIONBREAK:
                    row += ('<td>' + subjectname + '</td><td>' + themename
                            + '</td>')
                emit(row + '</tr>')
            if TSV:
                emit(idname + '\t' + titlename + '\t' + speakername + '\t'
                     + placename + '\t' + subjectname + '\t' + themename)
            status = _SEEK_ID


def main(argv=None):
    """Process the file named by argv[1]; return a process exit code."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('usage: ' + argv[0] + ' INPUT_FILE\n')
        return 2

    with open(argv[1]) as source:
        if HTML:
            # NOTE(review): page markup reconstructed; only the text
            # "ETS 2008" and the column names survived.
            print('<html><head><title>ETS 2008</title></head>')
            print('<body>')
            print('<h1>ETS 2008</h1>')
            print('<table>')
            if not SESSIONBREAK:
                print('<tr><th>ID</th><th>Title</th><th>Speaker</th>'
                      '<th>From</th><th>Subject</th><th>Theme</th></tr>')
        process(source)
        if HTML:
            print('</table></body></html>')
    return 0


if __name__ == '__main__':
    sys.exit(main())