Skip to content

Commit

Permalink
pyspark scripts to preprocess amazon-google data
Browse files Browse the repository at this point in the history
  • Loading branch information
navinrathore committed Jan 14, 2022
1 parent 54a41be commit f14e405
Show file tree
Hide file tree
Showing 5 changed files with 514 additions and 51 deletions.
112 changes: 61 additions & 51 deletions examples/amazon-google/config.json
Original file line number Diff line number Diff line change
@@ -1,72 +1,82 @@
{
"fieldDefinition":[
{
"fieldDefinition": [
{
"fieldName" : "title",
"matchType" : "text",
"fields" : "title",
"dataType": "\"string\""
"fieldName": "id",
"matchType": "DONT_USE",
"fields": "id",
"dataType": "\"string\""
},
{
"fieldName" : "manufacturer",
"fieldName": "title",
"matchType": "text",
"fields": "title",
"dataType": "\"string\""
},
{
"fieldName": "manufacturer",
"matchType": "fuzzy",
"fields" : "manufacturer",
"dataType": "\"string\""
"fields": "manufacturer",
"dataType": "\"string\""
},
{
"fieldName" : "price",
"fieldName": "price",
"matchType": "fuzzy",
"fields" : "price",
"dataType": "\"double\""
}],
"output" : [{
"name":"output",
"format":"csv",
"fields": "price",
"dataType": "\"double\""
}
],
"output": [
{
"name": "output",
"format": "csv",
"props": {
"location": "/tmp/zinggOutput",
"delimiter": ",",
"header":false
"header": false
}
}],
"data" : [{
"name":"test1",
"format":"csv",
"props": {
"location": "examples/Amazon-Google/dataset_orig/tableA.csv",
"delimiter": ",",
"header":false
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true} ,
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
},
}
],
"data": [
{
"name":"test2",
"format":"csv",
"name": "test1",
"format": "csv",
"props": {
"location": "examples/Amazon-Google/dataset_orig/tableB.csv",
"location": "examples/amazon-google/dataset_orig/tableA.csv",
"delimiter": ",",
"header":false
"header": false
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true} ,
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true},
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
}
],
"labelDataSampleSize" : 0.4,
"numPartitions":4,
"modelId": 1100,
"zinggDir": "models",
"collectMetrics": false
}
]
}"
},
{
"name": "test2",
"format": "csv",
"props": {
"location": "examples/amazon-google/dataset_orig/tableB.csv",
"delimiter": ",",
"header": false
},
"schema":
"{\"type\" : \"struct\",
\"fields\" : [
{\"name\":\"id\", \"type\":\"string\", \"nullable\":false},
{\"name\":\"title\", \"type\":\"string\", \"nullable\":true},
{\"name\":\"manufacturer\",\"type\":\"string\",\"nullable\":true},
{\"name\":\"price\", \"type\":\"double\", \"nullable\":true}
]
}"
}
],
"labelDataSampleSize": 0.4,
"numPartitions": 4,
"modelId": 1101,
"zinggDir": "models",
"collectMetrics": false
}
78 changes: 78 additions & 0 deletions examples/amazon-google/configLink.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"fieldDefinition":[
{
"fieldName" : "id",
"matchType" : "DONT_USE",
"fields" : "id",
"dataType": "\"string\""
},
{
"fieldName" : "title",
"matchType" : "text",
"fields" : "title",
"dataType": "\"string\""
},
{
"fieldName" : "manufacturer",
"matchType": "fuzzy",
"fields" : "manufacturer",
"dataType": "\"string\""
},
{
"fieldName" : "price",
"matchType": "fuzzy",
"fields" : "price",
"dataType": "\"double\""
}],
"output" : [{
"name":"output",
"format":"csv",
"props": {
"location": "/tmp/zinggOutput",
"delimiter": ",",
"header":false
}
}],
"data" : [{
"name":"testA",
"format":"csv",
"props": {
"location": "examples/amazon-google/testA.csv",
"delimiter": ",",
"header":true
},
"schema": "{\"type\": \"struct\", \"fields\": [{\"name\": \"id\", \"type\": \"string\", \"nullable\": false}, {\"name\": \"title\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"manufacturer\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"price\", \"type\": \"double\", \"nullable\": true}]}"
},
{
"name":"testB",
"format":"csv",
"props": {
"location": "examples/amazon-google/testB.csv",
"delimiter": ",",
"header":true
},
"schema": "{\"type\": \"struct\", \"fields\": [{\"name\": \"id\", \"type\": \"string\", \"nullable\": false}, {\"name\": \"title\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"manufacturer\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"price\", \"type\": \"double\", \"nullable\": true}]}"
}
],
"labelDataSampleSize" : 0.4,
"numPartitions":4,
"modelId": 1101,
"zinggDir": "models",
"collectMetrics": false
}
102 changes: 102 additions & 0 deletions examples/amazon-google/configWithTrainingSamples.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
{
"trainingSamples" : [{
"name":"trainingPos",
"format":"csv",
"props": {
"location": "examples/amazon-google/training.csv",
"delimiter": ",",
"header":false,
"badRecordsPath":"/tmp/bad"
},
"schema": "{\"type\": \"struct\", \"fields\": [{\"name\": \"z_cluster\", \"type\": \"string\", \"nullable\": false, \"metadata\": {}}, {\"name\": \"z_ismatch\", \"type\": \"integer\", \"nullable\": true, \"metadata\": {}}, {\"name\": \"id\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"title\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"manufacturer\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"price\", \"type\": \"double\", \"nullable\": true}]}"

}],
"fieldDefinition":[
{
"fieldName" : "id",
"matchType" : "DONT_USE",
"fields" : "id",
"dataType": "\"string\""
},
{
"fieldName" : "title",
"matchType" : "text",
"fields" : "title",
"dataType": "\"string\""
},
{
"fieldName" : "manufacturer",
"matchType": "fuzzy",
"fields" : "manufacturer",
"dataType": "\"string\""
},
{
"fieldName" : "price",
"matchType": "exact",
"fields" : "price",
"dataType": "\"double\""
}],
"output" : [{
"name":"output",
"format":"csv",
"props": {
"location": "/tmp/zinggOutput",
"delimiter": ",",
"header":false
}
}],
"data" : [{
"name":"test1",
"format":"csv",
"props": {
"location": "examples/amazon-google/dataset_orig/tableA.csv",
"delimiter": ",",
"header":true
},
"schema": "{\"type\": \"struct\", \"fields\": [{\"name\": \"id\", \"type\": \"string\", \"nullable\": false}, {\"name\": \"title\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"manufacturer\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"price\", \"type\": \"double\", \"nullable\": true}]}"
},
{
"name":"test2",
"format":"csv",
"props": {
"location": "examples/amazon-google/dataset_orig/tableB.csv",
"delimiter": ",",
"header":true
},
"schema": "{\"type\": \"struct\", \"fields\": [{\"name\": \"id\", \"type\": \"string\", \"nullable\": false}, {\"name\": \"title\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"manufacturer\", \"type\": \"string\", \"nullable\": true}, {\"name\": \"price\", \"type\": \"double\", \"nullable\": true}]}"
}
],
"labelDataSampleSize" : 0.4,
"numPartitions":4,
"modelId": 1101,
"zinggDir": "models",
"collectMetrics": false
}
Loading

0 comments on commit f14e405

Please sign in to comment.