-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathdepToStructConv.py
More file actions
executable file
·85 lines (65 loc) · 2.53 KB
/
depToStructConv.py
File metadata and controls
executable file
·85 lines (65 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env python2.7
"""
Convert treebank-style dependency parses to FUDG JSON data structures (one per line).
Arguments: <disambiguated_tokens_one_per_line> <dep_parses_pennconverter_tab_format>
@author: Manaal Faruqui (mfaruqui@cs.cmu.edu)
@since: 2013-02-18
"""
# Edges are like this: [to, from, None]
import sys, json
# Label of the artificial root node: it seeds every sentence's node list and
# is used as the head for tokens attached to index '0' or to punctuation.
ROOT = "W($$)"
# First command-line argument: file whose stripped lines supply the token
# text, read in lockstep with depFile.
rawFile = sys.argv[1]
# Second command-line argument: tab-separated dependency parse file; an
# empty line marks the end of a sentence.
depFile = sys.argv[2]
def get_edges(nodeIndices, indexedEdges, puncNodes):
    """Build the ``node_edges`` list for one sentence.

    Args:
        nodeIndices: dict mapping a token index to its node label.
        indexedEdges: iterable of ``(fromNode, toNode)`` token-index pairs.
        puncNodes: collection of token indices treated as punctuation; these
            indices have no entry in ``nodeIndices``.

    Returns:
        A list of ``[to, from, None]`` triples of node labels. Edges that
        start at a punctuation token are dropped. Edges that point at a
        punctuation token, or at index ``'0'``, are re-attached to ``ROOT``.

    Raises:
        KeyError: if a non-punctuation index is missing from ``nodeIndices``.
    """
    # Build the lookup once so each membership test is O(1) instead of a
    # list scan per edge.
    punct = set(puncNodes)

    node_edges = []
    for fromNode, toNode in indexedEdges:
        if fromNode in punct:
            # Punctuation tokens have no node, so nothing can hang off them
            # (this also covers the punctuation -> punctuation case).
            continue
        if toNode in punct or toNode == '0':
            # The head is either the sentence root or a token without a node.
            head = ROOT
        else:
            head = nodeIndices[toNode]
        node_edges.append([head, nodeIndices[fromNode], None])
    return node_edges
def dump_data(tokens, nodes, puncNodes, nodeIndices, node2words, indexedEdges):
    """Print one sentence as a single-line JSON object on stdout.

    Args:
        tokens: list of the sentence's tokens.
        nodes: list of node labels.
        puncNodes: token indices treated as punctuation (forwarded to
            ``get_edges``).
        nodeIndices: dict mapping token index to node label (forwarded to
            ``get_edges``).
        node2words: dict mapping node label to its list of words.
        indexedEdges: list of ``(fromNode, toNode)`` index pairs (forwarded
            to ``get_edges``).

    The emitted object has the keys ``tokens``, ``nodes``, ``node2words``
    and ``node_edges``.
    """
    sent_struct = {
        'tokens': tokens,
        'nodes': nodes,
        'node2words': node2words,
        'node_edges': get_edges(nodeIndices, indexedEdges, puncNodes),
    }
    # Parenthesised single-argument form: prints the same text under the
    # Python 2.7 print statement and is valid as the Python 3 print function.
    print(json.dumps(sent_struct))
def initialize():
    """Return a fresh set of empty per-sentence accumulators.

    The result is the 6-tuple
    ``(tokens, nodes, puncNodes, nodeIndices, node2words, indexedEdges)``;
    every container is new and empty except ``nodes``, which starts out
    holding ``ROOT``.
    """
    fresh_tokens = []
    fresh_nodes = [ROOT]
    fresh_punc_nodes = []
    fresh_node_indices = {}
    fresh_node2words = {}
    fresh_indexed_edges = []
    return (
        fresh_tokens,
        fresh_nodes,
        fresh_punc_nodes,
        fresh_node_indices,
        fresh_node2words,
        fresh_indexed_edges,
    )
# POS tags whose tokens are treated as punctuation: such tokens are kept in
# `tokens` but get no node of their own.
PUNCT_POS = ('.', ',', ':', '(', ')', "``", "''", "`", "'", '"')

tokens, nodes, puncNodes, nodeIndices, node2words, indexedEdges = initialize()

# `with` guarantees both input files are closed, even if a malformed line
# raises part-way through.
with open(rawFile, 'r') as rawLines, open(depFile, 'r') as depLines:
    # The two files are consumed in lockstep; zip stops at the shorter one.
    for lineRaw, lineDep in zip(rawLines, depLines):
        lineDep = lineDep.strip()
        lineRaw = lineRaw.strip()

        if lineDep == '':
            # An empty dependency line ends the sentence: emit it and start
            # over with fresh accumulators.
            dump_data(tokens, nodes, puncNodes, nodeIndices, node2words, indexedEdges)
            tokens, nodes, puncNodes, nodeIndices, node2words, indexedEdges = initialize()
            continue

        # Exactly four tab-separated fields are required; the third unpack
        # raises ValueError otherwise. The word column of the dependency
        # file is ignored in favour of the token from the raw file.
        index, _depWord, pos, info = lineDep.split('\t')
        # `info` must look like "<head index>/<role>"; the role is not used.
        connTo, _role = info.split('/')
        word = lineRaw

        # Every token is recorded along with its edge, punctuation or not.
        tokens.append(word)
        indexedEdges.append((index, connTo))

        if pos in PUNCT_POS:
            puncNodes.append(index)
        else:
            node = 'W(' + word + ')'
            nodes.append(node)
            nodeIndices[index] = node
            node2words[node] = [word]

# Emit whatever was accumulated after the last sentence separator.
# NOTE(review): when the input ends with an empty line this prints one extra
# structure with no tokens; kept as-is to preserve the existing output.
dump_data(tokens, nodes, puncNodes, nodeIndices, node2words, indexedEdges)