From 8fa6c6f9b044a4447d51b6fe3ec5dda237bf89bb Mon Sep 17 00:00:00 2001 From: SuchandraDatta Date: Sat, 19 Nov 2022 18:34:14 +0530 Subject: [PATCH 1/2] python postgres entity resolution example --- examples/febrl/postgres.py | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 examples/febrl/postgres.py diff --git a/examples/febrl/postgres.py b/examples/febrl/postgres.py new file mode 100644 index 000000000..cd6be877e --- /dev/null +++ b/examples/febrl/postgres.py @@ -0,0 +1,63 @@ +from zingg.client import * +from zingg.pipes import * +import sys + +#build the arguments for zingg +args = Arguments() + +#phase name to be passed as a command line argument +phase_name = sys.argv[1] + +#set field definitions +fname = FieldDefinition("fname", "string", MatchType.FUZZY) +lname = FieldDefinition("lname", "string", MatchType.FUZZY) +streetnumber = FieldDefinition("streetnumber", "string", MatchType.FUZZY) +street = FieldDefinition("street","string", MatchType.FUZZY) +address = FieldDefinition("address", "string", MatchType.FUZZY) +locality = FieldDefinition("locality", "string", MatchType.FUZZY) +areacode = FieldDefinition("areacode", "string", MatchType.FUZZY) +state = FieldDefinition("state", "string", MatchType.FUZZY) +dateofbirth = FieldDefinition("dateofbirth", "string", MatchType.FUZZY) +ssn = FieldDefinition("ssn", "string", MatchType.FUZZY) + +fieldDefs = [fname, lname, streetnumber, street, address, locality, areacode, state, dateofbirth, ssn] + +args.setFieldDefinition(fieldDefs) + +#defining input pipe +customerDataStaging = Pipe("test", "jdbc") +customerDataStaging.addProperty("url","jdbc:postgresql://localhost:5432/postgres") +customerDataStaging.addProperty("dbtable", "customers") +customerDataStaging.addProperty("driver", "org.postgresql.Driver") +customerDataStaging.addProperty("user","suchandra") +customerDataStaging.addProperty("password","1234") + +#add input pipe to arguments for Zingg client 
+args.setData(customerDataStaging) + +#defining output pipe +customerIdentitiesResoled = Pipe("test", "jdbc") +customerIdentitiesResoled.addProperty("url","jdbc:postgresql://localhost:5432/postgres") +customerIdentitiesResoled.addProperty("dbtable", "customers_unified") +customerIdentitiesResoled.addProperty("driver", "org.postgresql.Driver") +customerIdentitiesResoled.addProperty("user","suchandra") +customerIdentitiesResoled.addProperty("password","1234") + +#add output pipe to arguments for Zingg client +args.setOutput(customerIdentitiesResoled) + +#save latest model in directory models/customer360 +args.setModelId("customer360") +#store all models in directory models/ +args.setZinggDir("models") +#number of partitions over which the input data is distributed +args.setNumPartitions(4) +#fraction of total dataset to select data for labelling +args.setLabelDataSampleSize(0.5) + + +options = ClientOptions([ClientOptions.PHASE,phase_name]) + +#Zingg execution for the given phase +zingg = Zingg(args, options) +zingg.initAndExecute() \ No newline at end of file From 90f6db6e0254ae82372d2e846ce05c6aba2ede9f Mon Sep 17 00:00:00 2001 From: SuchandraDatta Date: Sun, 20 Nov 2022 19:57:38 +0530 Subject: [PATCH 2/2] python postgres example --- examples/febrl/postgres.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/febrl/postgres.py b/examples/febrl/postgres.py index cd6be877e..000d5b527 100644 --- a/examples/febrl/postgres.py +++ b/examples/febrl/postgres.py @@ -25,7 +25,7 @@ args.setFieldDefinition(fieldDefs) #defining input pipe -customerDataStaging = Pipe("test", "jdbc") +customerDataStaging = Pipe("customerDataStaging", "jdbc") customerDataStaging.addProperty("url","jdbc:postgresql://localhost:5432/postgres") customerDataStaging.addProperty("dbtable", "customers") customerDataStaging.addProperty("driver", "org.postgresql.Driver") @@ -36,15 +36,15 @@ args.setData(customerDataStaging) #defining output pipe -customerIdentitiesResoled = 
Pipe("test", "jdbc") -customerIdentitiesResoled.addProperty("url","jdbc:postgresql://localhost:5432/postgres") -customerIdentitiesResoled.addProperty("dbtable", "customers_unified") -customerIdentitiesResoled.addProperty("driver", "org.postgresql.Driver") -customerIdentitiesResoled.addProperty("user","suchandra") -customerIdentitiesResoled.addProperty("password","1234") +customerIdentitiesResolved = Pipe("customerIdentitiesResolved", "jdbc") +customerIdentitiesResolved.addProperty("url","jdbc:postgresql://localhost:5432/postgres") +customerIdentitiesResolved.addProperty("dbtable", "customers_unified") +customerIdentitiesResolved.addProperty("driver", "org.postgresql.Driver") +customerIdentitiesResolved.addProperty("user","suchandra") +customerIdentitiesResolved.addProperty("password","1234") #add output pipe to arguments for Zingg client -args.setOutput(customerIdentitiesResoled) +args.setOutput(customerIdentitiesResolved) #save latest model in directory models/customer360 args.setModelId("customer360")