-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathpreproc.py
More file actions
49 lines (39 loc) · 1.41 KB
/
preproc.py
File metadata and controls
49 lines (39 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os,sys,re,cgi
from collections import defaultdict
# Example input line (fields are tab-separated; the tabs are not visible here):
# 28849570224 @ciaranyree it was on football wives , one of the players and his wife own smash burger @ O V P N N , $ P D N & D N V ^ ^ kgimpel@ANDREW.CMU.EDU Thu Feb 17 2011 13:29:43 GMT-0500 (Eastern Standard Time)
#
# The script reads such lines from stdin and uses field 0 as the tweet ID,
# field 2 as the whitespace-separated tokens and field 3 as the
# whitespace-separated POS tags (presumably one tag per token -- verify
# against the data).  For every input line it writes one annotation template
# to "<outbase>.<NNNN>.anno", where <outbase> is the first command-line
# argument and NNNN is the zero-based index of the line on stdin.

# Tokens carrying one of these POS tags are never given a "^N" suffix.
_NO_DEDUP_POS = frozenset([',', '!'])


def _dedup_tokens(tokens, poses):
    """Return the tokens with repeated words made unique.

    A token that occurs more than once in `tokens`, and whose POS tag is not
    in `_NO_DEDUP_POS`, is rewritten as "<token>^<k>" where k counts its
    suffixed occurrences so far (starting at 1).  All other tokens are passed
    through unchanged.  Tokens and tags are paired with zip(), so pairing
    stops at the shorter of the two sequences.
    """
    # Frequencies are taken over all tokens, regardless of POS tag.
    counts = defaultdict(int)
    for tok in tokens:
        counts[tok] += 1

    output_tokens = []
    occurrence = defaultdict(int)
    for tok, pos in zip(tokens, poses):
        if counts[tok] > 1 and pos not in _NO_DEDUP_POS:
            occurrence[tok] += 1
            tok = tok + '^' + str(occurrence[tok])
        output_tokens.append(tok)
    return output_tokens


def _format_anno(tweet_id, tokens, poses, output_tokens):
    """Return the full text of one .anno file, ending with a newline."""
    target_text = ' '.join(output_tokens)
    lines = [
        "---",
        "% ID " + tweet_id,
        "% POS TEXT",
        ' '.join('%s/%s' % (tok, pos) for tok, pos in zip(tokens, poses)),
        "% TARGET TEXT",
        "",
        target_text,
        "",
        # The de-duplicated text is repeated under ANNO as the starting
        # point for the annotation.
        "% ANNO",
        "",
        target_text,
    ]
    return '\n'.join(lines) + '\n'


outbase = sys.argv[1]
for tweet_i, line in enumerate(sys.stdin):
    # Skip blank lines (e.g. a trailing empty line) instead of crashing on
    # them.  tweet_i still counts them, so the file numbering of every other
    # line is unaffected.
    if not line.strip():
        continue
    parts = line.split('\t')
    if len(parts) < 4:
        raise ValueError(
            "input line %d has %d tab-separated field(s); expected at least 4"
            % (tweet_i + 1, len(parts)))
    tweet_id = parts[0]
    tokens = parts[2].split()
    poses = parts[3].split()
    output_tokens = _dedup_tokens(tokens, poses)
    with open("%s.%04d.anno" % (outbase, tweet_i), 'w') as f:
        # f.write() instead of the Python 2-only `print >>f` statement; the
        # bytes written are the same.
        f.write(_format_anno(tweet_id, tokens, poses, output_tokens))