pbh
pbh

Reputation: 452

Python code failing: dedupe library error

I am trying to learn about the dedupe library. I am trying to match names that are more than 80% similar.

I am sharing the code and the error below. Please help.

import dedupe
from Levenshtein import distance

def test():
    """Try to dedupe sample records by name (as posted, fails inside dedupe.Dedupe(fields))."""

    # Sample data (replace with your actual library data)
    data = [
        {'name': 'Alice Smith', 'address': '123 Main St', 'phone': '555-1212'},
        {'name': 'Alice SmIth', 'address': '123 Main Street', 'phone': '555-1213'},
        {'name': 'Bob Johnson', 'address': '456 Elm St', 'phone': '555-3434'},
        {'name': 'Charlie Brown', 'address': '789 Maple Ave', 'phone': '555-5656'},
    ]

    # Field definitions handed to dedupe.Dedupe below.
    # NOTE: the entry has no 'type' key, which is what the KeyError in the traceback reports.
    fields = [
        {'field': 'name', 'comparators': ['name_similarity']},

    ]

    # Similarity helper; `fields` only mentions it by name as a string, it is never passed or called here
    def name_similarity(s1, s2):
        # Levenshtein edit distance; the division below fails if both strings are empty (max length 0)
        distance1 = distance(s1, s2)
        similarity = 1 - (distance1 / max(len(s1), len(s2)))  # Normalize distance to 0-1 similarity
        return similarity

    # NOTE: per the traceback, execution stops at the Dedupe(...) call below,
    # so none of the statements after it are reached.

    # Build the deduper from the field definitions and request a 0.8 threshold
    deduper = dedupe.Dedupe(fields)
    deduper.threshold( threshold=0.8)

    # Process the data for deduplication
    deduped_data = deduper.dedupe(data)

    # Print the deduplicated results
    print("Deduplicated Data:")
    for cluster in deduped_data:
        print(cluster)


if __name__ == '__main__':  # run test() only when this file is executed as a script
    test()

.....

C:\PythonProject\pythonProject\venv\Graph_POC\Scripts\python.exe C:\PythonProject\pythonProject\matching.py
Traceback (most recent call last):
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 152, in typify_variables
    variable_type = definition["type"]
                    ~~~~~~~~~~^^^^^^^^
KeyError: 'type'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\PythonProject\pythonProject\matching.py", line 45, in <module>
    test()
  File "C:\PythonProject\pythonProject\matching.py", line 32, in test
    deduper = dedupe.Dedupe(fields)
              ^^^^^^^^^^^^^^^^^^^^^
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\api.py", line 1155, in __init__
    self.data_model = datamodel.DataModel(variable_definition)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 42, in __init__
    self.primary_variables, all_variables = typify_variables(variable_definitions)
                                            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\PythonProject\pythonProject\venv\Graph_POC\Lib\site-packages\dedupe\datamodel.py", line 161, in typify_variables
    raise KeyError(
KeyError: "Missing variable type: variable specifications are dictionaries that must include a type definition, ex. {'field' : 'Phone', type: 'String'}"

Process finished with exit code 1

Upvotes: 1

Views: 96

Answers (1)

Johnny Cheesecutter
Johnny Cheesecutter

Reputation: 2853

My answer is for dedupe>=2.0 and the code may not work on other library versions.

I would also suggest checking this manual; it has a lot of useful information: https://dedupeio.github.io/dedupe-examples/docs/csv_example.html

  1. Back to your error: when initializing the dedupe.Dedupe class, it expects you to pass information about the field types (see the fields variable in the code below).

  2. Your data should also be formatted a bit differently: each row should have an id. See the data_d variable in the code below.

  3. One more thing you are missing is a list of labeled examples: several matching pairs and several distinct pairs (it is better to have at least 10 pairs in each group). They are stored in the labeled_examples variable in the code.

  4. Lastly, if you plan to use custom comparators, you should set the field's 'type' to Custom. Otherwise dedupe will use the standard comparator for the field type (String/Category/Float etc.).

Check this link for all available variable definitions:

https://docs.dedupe.io/en/latest/Variable-definition.html

import dedupe
from Levenshtein import distance

# Define similarity functions - customize based on your matching criteria
def name_similarity(s1: str, s2: str) -> float:
    """Return a normalized Levenshtein similarity between two strings.

    The edit distance between the strings is divided by the length of the
    longer one and subtracted from 1, so identical strings score 1.0 and
    the score decreases as more edits are needed.

    Args:
        s1: First string to compare.
        s2: Second string to compare.

    Returns:
        A similarity score between 0.0 and 1.0.
    """
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Both strings are empty: they are identical, and normalizing by a
        # zero length would raise ZeroDivisionError.
        return 1.0
    # Normalize the edit distance to a 0-1 similarity
    return 1 - (distance(s1, s2) / longest)

def test():
    """Train a dedupe model on a handful of sample records and print the clusters.

    Labels a few matching and distinct record pairs, trains a
    ``dedupe.Dedupe`` model on them, partitions the records into clusters,
    then prints the number of clusters and a mapping of record id to its
    cluster id and confidence score.
    """
    # Sample data (replace with your actual library data); rows come in
    # near-duplicate pairs
    data = [
        {'name': 'Alice Smith', 'address': '123 Main St', 'phone': '555-1212'},
        {'name': 'Alice SmIth', 'address': '123 Main Street', 'phone': '555-1213'},
        {'name': 'Bob Johnson', 'address': '456 Elm St', 'phone': '555-3434'},
        {'name': 'Bob Johnson', 'address': '457 Elm St', 'phone': '555-3434'},
        {'name': 'Charlie Brown', 'address': '789 Maple Ave', 'phone': '555-5656'},
        {'name': 'Charlie Brown', 'address': '789 Meple Ave', 'phone': '555-5656'},
        {'name': 'Karry Perry', 'address': '102 Meple Ave', 'phone': '555-3556'},
        {'name': 'Karri Perry', 'address': '102 Meple Ave', 'phone': '555-3556'},
    ]

    # Labeled training examples, expressed as index pairs into `data`
    matching_pairs = ((0, 1), (2, 3), (4, 5))
    distinct_pairs = ((0, 2), (2, 4), (4, 6))
    labeled_examples = {
        "match": [(data[left], data[right]) for left, right in matching_pairs],
        "distinct": [(data[left], data[right]) for left, right in distinct_pairs],
    }

    # Key each record by its position in `data` so every row has an id
    data_d = dict(enumerate(data))

    # Field definitions: a custom comparator for names, the String type for
    # phone numbers (an 'address' String field could be added the same way)
    fields = [
        {'field': 'name', 'type': 'Custom', 'comparator': name_similarity},
        {'field': 'phone', 'type': 'String'},
    ]

    deduper = dedupe.Dedupe(fields)
    deduper.prepare_training(data_d)
    deduper.mark_pairs(labeled_examples)

    # Needed only because this dataset is very small; remove it if your
    # dataset has more than 10 examples in each class. Per the original
    # author's note, this is the GridSearchCV.cv parameter used for KFold.
    deduper.classifier.cv = 2

    # Train the model on the labeled pairs
    deduper.train()

    clusters = deduper.partition(data_d, threshold=0.5)

    print('# duplicate sets', len(clusters))

    # Map each record id to the cluster it landed in and its score
    cluster_membership = {
        record_id: {
            "Cluster ID": cluster_id,
            "confidence_score": score,
        }
        for cluster_id, (member_ids, member_scores) in enumerate(clusters)
        for record_id, score in zip(member_ids, member_scores)
    }

    print(cluster_membership)



if __name__ == '__main__':  # run test() only when this file is executed as a script
    test()

output:

{0: {'Cluster ID': 0, 'confidence_score': 0.82329434}, 
 1: {'Cluster ID': 0, 'confidence_score': 0.82329434}, 
 2: {'Cluster ID': 1, 'confidence_score': 0.96056044}, 
 3: {'Cluster ID': 1, 'confidence_score': 0.96056044}, 
 4: {'Cluster ID': 2, 'confidence_score': 0.96056044}, 
 5: {'Cluster ID': 2, 'confidence_score': 0.96056044}, 
 6: {'Cluster ID': 3, 'confidence_score': 0.9537174}, 
 7: {'Cluster ID': 3, 'confidence_score': 0.9537174}}

Upvotes: 0

Related Questions