-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathknowledge_graph.py
More file actions
491 lines (417 loc) · 20.1 KB
/
knowledge_graph.py
File metadata and controls
491 lines (417 loc) · 20.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import pickle
def create_knowledge_graph_pql(file_path: str, from_kb=False, hops=2):
    """
    Builds a directed multigraph from a tab-separated text file.

    Two line layouts are supported:

    * ``from_kb=True``: every line is ``head<TAB>relation<TAB>tail``. Lines
      that do not split into exactly three fields are reported on stdout and
      skipped.
    * ``from_kb=False``: the third tab-separated field of every line is a
      '#'-separated chain ``e0#r0#e1#r1#e2...``; the first ``hops`` triples of
      that chain are added.

    Args:
        file_path (str): Path of the UTF-8 text file to read.
        from_kb (bool): Selects the line layout described above.
        hops (int): Number of triples taken from each chain (only used when
            ``from_kb`` is False).

    Returns:
        networkx.MultiDiGraph: One edge per triple, keyed by the relation and
        carrying it in the 'relation' attribute.
    """
    graph = nx.MultiDiGraph()
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if from_kb:
                fields = text.split('\t')
                if len(fields) != 3:
                    print(f"Length of line != 3: {text}")
                    continue
                head, relation, tail = fields
                graph.add_edge(head, tail, key=relation, relation=relation)
            else:
                chain = text.split('\t')[2].split('#')
                for hop in range(hops):
                    # Triple number `hop` occupies positions 2*hop .. 2*hop+2,
                    # so consecutive triples share an entity.
                    base = 2 * hop
                    head = chain[base]
                    relation = chain[base + 1]
                    tail = chain[base + 2]
                    graph.add_edge(head, tail, key=relation, relation=relation)
    return graph
def create_knowledge_graph_mlpq(txt_file_paths: list, from_kb=True, hops=2):
    """
    Builds a directed multigraph either from '@@@'-separated triple files or
    from already extracted evidence chains.

    Args:
        txt_file_paths: With ``from_kb=True`` an iterable of paths to UTF-8
            text files whose lines look like ``head@@@relation@@@tail``. With
            ``from_kb=False`` the argument is indexed with ``'evidences'`` and
            every item found there is treated as a flat chain
            ``[e0, r0, e1, r1, e2, ...]``.
        from_kb (bool): Selects which of the two inputs is expected.
        hops (int): Number of triples taken from each evidence chain (only
            used when ``from_kb`` is False).

    Returns:
        networkx.MultiDiGraph: One edge per triple, keyed by the relation and
        carrying it in the 'relation' attribute. Triples read from files are
        lower-cased; evidence chains are stored unchanged.
    """
    graph = nx.MultiDiGraph()

    if not from_kb:
        for chain in txt_file_paths['evidences']:
            for hop in range(hops):
                # Triple number `hop` occupies positions 2*hop .. 2*hop+2.
                base = 2 * hop
                head = chain[base]
                relation = chain[base + 1]
                tail = chain[base + 2]
                graph.add_edge(head, tail, key=relation, relation=relation)
        return graph

    for path in txt_file_paths:
        with open(path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                text = raw_line.strip()
                fields = text.split('@@@')
                if len(fields) != 3:
                    print(f"Length of line != 3: {text}")
                    continue
                head, relation, tail = (field.lower() for field in fields)
                graph.add_edge(head, tail, key=relation, relation=relation)
    return graph
# We need an undirected view of the graph here (every edge must also be traversable backwards),
# otherwise the paths necessary for the questions cannot be found.
# Example: for the question "which person wrote the films directed by [Yuriy Norshteyn]" the answer is Sergei Kozlov,
# but in the graph the edges are:
# Film --> directed_by --> Yuriy Norshteyn
# Film --> written_by --> Sergei Kozlov
def create_knowledge_graph_metaqa(data: pd.DataFrame, iterations=0, from_kb=True, max_answers=1):
    """
    Builds a directed multigraph for MetaQA, either from the knowledge-base
    triples or from previously extracted evidence paths.

    Args:
        data (pandas.DataFrame): With ``from_kb=True`` a frame with the columns
            'entity1', 'relation' and 'entity2'. With ``from_kb=False`` a frame
            with an 'evidences' column whose cells are lists of evidence paths
            ``[e0, r0, e1, r1, e2, ...]``.
        iterations (int): Only used with ``from_kb=True``. 0 means "use every
            row"; a positive value stops after that many triples have been
            added (skipped 'has_tags' rows do not count).
        from_kb (bool): Selects which of the two inputs is expected.
        max_answers (int | None): Only used with ``from_kb=False``. Rows whose
            evidence list is longer than this are ignored; None disables the
            filter.

    Returns:
        networkx.MultiDiGraph: One edge per triple, keyed by the relation and
        carrying it in the 'relation' attribute.

    Raises:
        ValueError: If an evidence path does not alternate
            entity, relation, ..., entity (i.e. has an even length or fewer
            than three elements).
    """
    G = nx.MultiDiGraph()

    if from_kb:
        added = 0
        progress = tqdm(
            data.iterrows(),
            desc="Creating Knowledge Graph...",
            total=len(data) if iterations == 0 else iterations,
            dynamic_ncols=True,
        )
        for _, row in progress:
            # The previous `index > iterations` test let iterations + 1 triples
            # through; stop as soon as exactly `iterations` have been added.
            if iterations != 0 and added >= iterations:
                break

            head = row['entity1'].strip().lower()
            relation = row['relation'].strip().lower()
            tail = row['entity2'].strip().lower()
            if relation == "has_tags":
                continue

            G.add_edge(head, tail, key=relation, relation=relation)
            # Questions may have to be answered against the edge direction, so
            # every edge gets a '<relation>_reversed' twin. Self-loops are left
            # alone because the twin would connect the same node pair again.
            if head != tail:
                G.add_edge(tail, head, key=f"{relation}_reversed", relation=f"{relation}_reversed")
            added += 1
    else:
        for evidences_list in tqdm(data['evidences']):
            if max_answers is not None and len(evidences_list) > max_answers:
                continue
            for evidence in evidences_list:
                if len(evidence) < 3 or len(evidence) % 2 == 0:
                    raise ValueError(
                        f"Evidence path must alternate entity, relation, ..., entity: {evidence!r}"
                    )
                # Walk the path triple by triple so 1-hop, 2-hop and longer
                # paths are all accepted (2-hop behaves exactly as before).
                for pos in range(0, len(evidence) - 2, 2):
                    head = evidence[pos]
                    relation = evidence[pos + 1]
                    tail = evidence[pos + 2]
                    G.add_edge(head, tail, key=relation, relation=relation)
    return G
def create_knowledge_graph_wikimultihop(data, iterations=0):
    """
    Builds a directed multigraph from the evidence triples stored under
    ``data['evidences']``.

    Args:
        data: Mapping-like object (e.g. a DataFrame or dict) whose 'evidences'
            entry is an iterable of entries, each entry being an iterable of
            ``(head, relation, tail)`` triples of strings.
        iterations (int): 0 means "use every entry"; a positive value limits
            the graph to the first ``iterations`` entries.

    Returns:
        networkx.MultiDiGraph: One edge per triple (whitespace-stripped), keyed
        by the relation and carrying it in the 'relation' attribute.
    """
    G = nx.MultiDiGraph()
    for processed, entry in enumerate(data['evidences']):
        # The previous `index > iterations` test let iterations + 1 entries
        # through; stop once exactly `iterations` entries have been consumed.
        if iterations != 0 and processed >= iterations:
            break
        for triple in entry:
            head = triple[0].strip()
            relation = triple[1].strip()
            tail = triple[2].strip()
            G.add_edge(head, tail, key=relation, relation=relation)
    return G
from collections import defaultdict
def visualize_knowledge_graph(kg):
    """
    Draws the knowledge graph with matplotlib and opens the plot window.

    Nodes are placed with a spring layout. Because several edges may connect
    the same ordered node pair in a multigraph, their 'relation' attributes
    are merged into one newline-separated label per pair.

    Args:
        kg (networkx.Graph): The graph to draw.
    """
    layout = nx.spring_layout(kg)

    node_style = dict(
        with_labels=True,
        node_size=1500,
        node_color="lightblue",
        font_size=7,
        font_weight="bold",
        edge_color="gray",
    )
    nx.draw(kg, layout, **node_style)

    # Group the relation names of parallel edges by their (source, target) pair;
    # edges without a 'relation' attribute contribute an empty string.
    relations_by_pair = {}
    for source, target, attributes in kg.edges(data=True):
        relations_by_pair.setdefault((source, target), []).append(attributes.get('relation', ''))

    labels = {}
    for pair, names in relations_by_pair.items():
        labels[pair] = "\n".join(names)

    nx.draw_networkx_edge_labels(kg, layout, edge_labels=labels, font_color='red', font_size=7)
    plt.show()
def print_graph(kg):
    """
    Prints the nodes and edges (with attributes) of the graph, followed by
    their counts.

    Args:
        kg (networkx.Graph): The graph to dump to stdout.
    """
    node_view = kg.nodes()
    edge_view = kg.edges(data=True)
    print("Nodes:", node_view)
    print("Edges:", edge_view)
    print(f"#Nodes: {len(node_view)}")
    print(f"#Edges: {len(edge_view)}")
#################################################
# MetaQA: how the evidences are obtained
# We use the qtype txt files, which give the type path (e.g. 'writer_to_movie_to_genre') that leads from the question entity to the answer
import networkx as nx
import re
import pandas as pd
from collections import Counter
import itertools
from tqdm import tqdm
import ast
import os
# Mapping from (source_type, target_type) to knowledge graph relations
# '_reversed' indicates traversal in the reverse direction (from target to source)
import networkx as nx
import re
import pandas as pd
from collections import Counter
import itertools
from tqdm import tqdm
import ast
import os
# Mapping from (source_type, target_type) to knowledge graph relations
# '_reversed' indicates traversal in the reverse direction (from target to source)
# Every movie attribute type paired with the relation that links a movie to it.
_MOVIE_ATTRIBUTE_RELATIONS = (
    ('language', 'in_language'),
    ('year', 'release_year'),
    ('writer', 'written_by'),
    ('director', 'directed_by'),
    ('genre', 'has_genre'),
    ('actor', 'starred_actors'),
)

# (source_type, target_type) -> relation name. The movie -> attribute entries
# come first, followed by the attribute -> movie entries, which use the same
# relation with the '_reversed' suffix.
TRANSITION_TO_RELATION = {
    **{('movie', attribute): relation for attribute, relation in _MOVIE_ATTRIBUTE_RELATIONS},
    **{(attribute, 'movie'): f'{relation}_reversed' for attribute, relation in _MOVIE_ATTRIBUTE_RELATIONS},
}
def extract_starting_entity(question):
    """
    Returns the text of the first ``[...]`` group in the question,
    whitespace-stripped and lower-cased, or None when the question contains
    no bracketed group.
    """
    bracketed = re.search(r'\[(.*?)\]', question)
    if bracketed is None:
        return None
    entity = bracketed.group(1)
    return entity.strip().lower()
def split_answers(answer_str):
    """
    Breaks a '|'-separated answer string into its parts; every part is
    whitespace-stripped and lower-cased. An empty string yields [''].
    """
    cleaned = []
    for piece in answer_str.split('|'):
        cleaned.append(piece.strip().lower())
    return cleaned
def validate_triplet(G, entity1, relation, entity2, debug=False):
    """
    Tells whether the graph holds an edge entity1 -> entity2 labelled with the
    given relation.

    The edge's 'relation' attribute is whitespace-stripped and compared
    case-insensitively. For multigraphs any of the parallel edges may match.

    Args:
        G (networkx.Graph): The knowledge graph.
        entity1 (str): Source entity.
        relation (str): Relation name to look for.
        entity2 (str): Target entity.
        debug (bool): If True, print what is being checked and the outcome.

    Returns:
        bool: True if a matching edge exists, False otherwise.
    """
    def carries_relation(attributes):
        return attributes.get('relation', '').strip().lower() == relation.lower()

    if debug:
        print(f"Validating triplet: ({entity1}, {relation}, {entity2})")

    if G.has_edge(entity1, entity2):
        edge_data = G.get_edge_data(entity1, entity2)
        if not G.is_multigraph():
            # Simple graph: edge_data is the attribute dict of the single edge.
            if carries_relation(edge_data):
                if debug:
                    print(f" Triplet exists: ({entity1}, {relation}, {entity2})")
                return True
        else:
            # Multigraph: edge_data maps each edge key to its attribute dict.
            for edge_key, attributes in edge_data.items():
                if not carries_relation(attributes):
                    continue
                if debug:
                    print(f" Triplet exists via edge {edge_key}: ({entity1}, {relation}, {entity2})")
                return True

    if debug:
        print(f" Triplet does NOT exist: ({entity1}, {relation}, {entity2})")
    return False
def _with_reversed_self_loop_edges(edge_attrs):
    """
    Returns a copy of a self-loop's key -> attributes mapping in which every
    edge additionally appears under '<key>_reversed' with the relation
    '<relation>_reversed'. The mapping passed in is left untouched.
    """
    augmented = dict(edge_attrs)
    for key, attrs in edge_attrs.items():
        mirrored = dict(attrs)
        relation = mirrored.get('relation')
        if isinstance(relation, str):
            mirrored['relation'] = f"{relation}_reversed"
        augmented[f"{key}_reversed"] = mirrored
    return augmented


def find_paths_with_relations(G, start, end, relation_path, debug=False):
    """
    Given a relation path (e.g., 'writer_to_movie_to_genre'), maps it to knowledge graph relations,
    traverses the KG starting from 'start', follows the relations in sequence, and finds paths to 'end'.

    Each '<a>_to_<b>' step is translated with TRANSITION_TO_RELATION; the graph is then searched
    depth-first, following at every step only outgoing edges whose 'relation' attribute
    (whitespace-stripped, case-insensitive) equals the relation required at that step.
    Edges are only followed when G is a multigraph. The graph is never modified.

    Args:
        G (networkx.Graph): The knowledge graph.
        start (str): The starting entity.
        end (str): The target entity.
        relation_path (str): The relation path from the txt file (e.g., 'writer_to_movie_to_genre').
        debug (bool): If True, print debug statements.

    Returns:
        List of tuples: Each tuple contains (path, relation_sequence), where path alternates
        entities and relations ([start, rel, node, ..., end]) and relation_sequence is the tuple
        of relations in that path. The list is empty when a step has no relation mapping, when
        'start' is not in the graph, or when no matching path ends at 'end'.
    """
    # 'writer_to_movie_to_genre' -> ['writer', 'movie', 'genre'] -> consecutive type pairs.
    parts = relation_path.split('_to_')
    relation_sequence = []
    for src, tgt in zip(parts[:-1], parts[1:]):
        relation = TRANSITION_TO_RELATION.get((src.lower(), tgt.lower()))
        if not relation:
            if debug:
                print(f" Warning: No relation mapping found for transition ({src}, {tgt}). Skipping path.")
            return []  # Invalid path due to missing mapping
        relation_sequence.append(relation)

    paths_rel = []
    # DFS stack entries: (current_node, path so far, relations still to follow).
    stack = [(start, [start], relation_sequence)]
    while stack:
        current_node, current_path, relations_left = stack.pop()
        if debug:
            print(f"Traversing from '{current_node}' with relations left: {relations_left}")

        if not relations_left:
            if current_node == end:
                # Relations sit at the odd positions of the alternating path.
                relation_seq = tuple(current_path[1::2])
                paths_rel.append((current_path.copy(), relation_seq))
                if debug:
                    print(f" Found valid path: {current_path}")
            continue

        next_relation = relations_left[0]
        remaining = relations_left[1:]
        wanted = next_relation.lower()
        if debug:
            print(f" Next relation to traverse: '{next_relation}'")

        # An unknown entity has no outgoing edges; G.successors would raise for it.
        if not G.has_node(current_node) or not G.is_multigraph():
            continue

        for neighbor in G.successors(current_node):
            edge_attrs = G.get_edge_data(current_node, neighbor)
            if not edge_attrs:
                continue  # No such edge

            if current_node == neighbor:
                # A self-loop can be walked in either direction, so it also
                # offers the '_reversed' relation. get_edge_data returns the
                # graph's own dict for multigraphs, therefore the extra entries
                # go into a copy instead of being written into the graph.
                edge_attrs = _with_reversed_self_loop_edges(edge_attrs)
                if debug:
                    print(f"edge_attrs = {edge_attrs}")
                    print(f"actual_relation = {next_relation}")

            for attrs in edge_attrs.values():
                if attrs.get('relation', '').strip().lower() != wanted:
                    continue
                if debug:
                    print(f" Current Node: '{current_node}', Neighbor: '{neighbor}', Relation: '{next_relation}'")
                current_path_extended = current_path + [next_relation, neighbor]
                if debug:
                    print(f" Extending path: {current_path_extended}")
                # The edge was just read from the graph with the required
                # relation, so no separate triplet validation is needed.
                stack.append((neighbor, current_path_extended, remaining))
                break  # Only need to find one matching edge
    return paths_rel
def process_questions(question_answer_file, relation_path_file, G, n):
    """
    Processes each question-answer pair, reads the corresponding relation path from the txt file,
    searches the knowledge graph for a matching path to every answer, and constructs evidences.

    A question is kept only if a path was found for every one of its answers; the first path
    found per answer is stored as that answer's evidence. Skipped questions are reported on stdout.

    Args:
        question_answer_file (str): Path to the TSV file containing questions and answers
            (two columns, no header; answers separated by '|').
        relation_path_file (str): Path to the TXT file containing one relation path per question,
            in the same order as the TSV rows.
        G (networkx.Graph): The knowledge graph.
        n (int): The number of relations (hops). Accepted for interface compatibility;
            this function does not read it.

    Returns:
        pandas.DataFrame: DataFrame with the columns 'question', 'answers' (list of lower-cased
        answers) and 'evidences' (one path per answer).

    Raises:
        ValueError: If the two files do not contain the same number of entries.
    """
    with open(relation_path_file, 'r', encoding='utf-8') as f:
        relation_paths = f.read().splitlines()

    # Keep every cell as text: without dtype=str numeric answers (e.g. years)
    # may be parsed as numbers, and without keep_default_na=False answers such
    # as "NA" or "null" become NaN - both break the string handling below.
    df = pd.read_csv(
        question_answer_file,
        sep='\t',
        header=None,
        names=['question', 'answer'],
        dtype=str,
        keep_default_na=False,
    )

    if len(df) != len(relation_paths):
        raise ValueError("The number of questions and relation paths do not match.")

    questions = []
    answers_list = []
    evidences_list = []

    rows = zip(df['question'], df['answer'], relation_paths)
    for question, answer_str, relation_path in tqdm(rows, total=len(df), desc="Processing Questions"):
        relation_path = relation_path.strip()  # e.g., 'writer_to_movie_to_genre'

        # A row with a missing column still yields a non-string cell; treat it
        # like a question without entity / without answers instead of crashing.
        start_entity = extract_starting_entity(question) if isinstance(question, str) else None
        if not start_entity:
            print(f" No starting entity found in question: '{question}'. Skipping.\n")
            continue

        answers = split_answers(answer_str) if isinstance(answer_str, str) else []
        # split_answers never returns an empty list, so look at the parts.
        if not any(answers):
            print(f" No answers found for question: '{question}'. Skipping.\n")
            continue

        evidences = []
        for answer in answers:
            paths_rel = find_paths_with_relations(G, start_entity, answer, relation_path, debug=False)
            if not paths_rel:
                print(f" No valid path found for answer '{answer}' with relation path '{relation_path}'.")
                break  # One unreachable answer disqualifies the whole question
            # The relation path is predefined, so the first path is used.
            path, _relation_seq = paths_rel[0]
            evidences.append(path)  # Full path including entities and relations

        if len(evidences) != len(answers):
            print(f" Skipping question due to missing paths for some answers: '{question}'.\n")
            continue

        questions.append(question)
        answers_list.append(answers)
        evidences_list.append(evidences)

    return pd.DataFrame({
        'question': questions,
        'answers': answers_list,
        'evidences': evidences_list
    })
def display_outgoing_relations(G, entity):
    """
    Prints one line per edge that leaves the given entity, showing the edge's
    'relation' attribute ('no_relation' if absent) and the neighbor it leads to.
    If the entity is not a node of the graph, a notice is printed instead.

    Args:
        G (networkx.Graph): The knowledge graph.
        entity (str): The entity to inspect.
    """
    if not G.has_node(entity):
        print(f"Entity '{entity}' does not exist in the knowledge graph.")
        return

    print(f"Outgoing relations from '{entity}':")
    for neighbor in G.neighbors(entity):
        edge_data = G.get_edge_data(entity, neighbor)
        # Multigraphs return key -> attributes, simple graphs the attributes themselves.
        attribute_dicts = edge_data.values() if G.is_multigraph() else [edge_data]
        for attributes in attribute_dicts:
            print(f" --[{attributes.get('relation', 'no_relation')}]--> {neighbor}")