TranscriptResolver/normalization_tools.py at master · junyinglim/TranscriptResolver · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
## NORMALIZATION TOOLS
# Description: A set of tools for fuzzy matching crowd-sourced verbatim to refernece libraries
# Author: Junying Lim
# Date: 21th Sept 2015

# Notes:
# Developed during the CITScribe hackathon organized by iDigBio, Gainsville, Florida (15 Dec - 20 Dec 2013)
# For more information on using MAFFT for string matching http://mafft.cbrc.jp/alignment/software/textcomparison.html


## DEPENDENCIES
import pandas as pd
from collections import defaultdict
import itertools as it
from fuzzywuzzy import fuzz # Fuzzy string matching for best_transcript()
from fuzzywuzzy import process # For reflst matching
from Levenshtein import * # Levenshtein distance for best_transcript()

import os # Path tools
import string # String tools
import nltk # For tokenizing
import subprocess # For subprocessing MAFFT
import re # Regular expressions


def fill_pd(x, pd):
    ''' Populates a pandas.DataFrame by row

        Args:
            x : list of lists (each element of those lists)
            pd: empty pandas data frame

        Returns:
            A pandas.DataFrame

    '''
    row = 0
    for i in x:
        col = 0

        if len(i) > len(pd.columns):
            print("There were more entries than columns, will leave row blank")
            pd.loc[row, col] = ""

        # Else fill it up as per normal
        else:
            for j in i:
                pd.loc[row, list(pd.columns.values)[col]] = str(j)
                col += 1

        row += 1

    return pd.fillna("")


def refcheck(datalst, reflst, threshold):
    ''' Uses fuzzy string matching to identify the most likely name from a reference list

        Args:
            datalst     : list, contains names to check
            reflst      : list, contains list of possible names
            threshold   : threshold similarity before a name on reference list is considered likely. If 1, then matches must be exact.

        Returns:
            a tuple of 2 lists - one list of the best matches, and the second of certainty values

    '''
    estimate = list()
    score = list()

    counter = 0
    for x in datalst:
        #if counter % 100 == 0:
         #       print(counter, "of", len(datalst), "entries normalized.")

    # If empty, then return empty
        if x == "":
            estimate.append("")
            score.append("NA")
            counter += 1

    # Else, find the most similar collector name from the whole reference list
        else:
            temp = [ratio(x, y) for y in reflst]
            max_ratio = max(temp)

            if max_ratio < threshold:
                estimate.append("")
                score.append(max_ratio)

            else:
                estimate.append(reflst[temp.index(max_ratio)])
                score.append(max_ratio)

            counter += 1

    return (estimate, score)


def reflist_check(datalst, reflst, threshold):

    ''' Uses fuzzy string matching to identify the most likely name from a reference list

        Args:
            datalst     : list, contains names to check
            reflst      : list, contains list of possible names

        Returns:
            a tuple containing a list of best matches, and certainty values

    >>> x = ["E.G.Lyndsey"]
    >>> y = ["Gordon", "Lyndsey", "E.G.Lyndsey"]
    >>> reflist_check(x,y)

    '''
    # Create empty lists
    estimate = ["NA"] * len(datalst)
    certainty = ["NA"] * len(datalst)
    reflistkeys = [key.decode('latin-1') for key in reflst.keys()]

    counter = 0
    for x in datalst:

        # Print counter (the function takes a long time)
        if counter % 100 == 0:
            print(counter, "of", len(datalst), "entries normalized.")

        if x == "":
            estimate[counter] = ""
            certainty[counter] = "NA"
            counter += 1

        else:

            temp = process.extractOne(x, reflistkeys)

            if temp[1] < threshold:
                estimate[counter] = ""
                certainty[counter] = temp[1]

            else:
                estimate[counter] = reflst[temp[0]]
                certainty[counter] = temp[1]
            counter += 1

    return (estimate, certainty)