Reputation: 91
I have the following JSON from which I need to extract the full_address_bucket__
structures (present in ['hits']['hits']['_source']
) into a Spark dataframe with Schema of full_address_bucket__
preserved using pyspark library
{
"took" : 436,
"timed_out" : false,
"_shards" : {
"total" : 3,
"successful" : 3,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1194393,
"max_score" : 1.0,
"hits" : [
{
"_index" : "epf_0001",
"_type" : "document",
"_id" : "PUKOL1964690000",
"_score" : 1.0,
"_source" : {
"full_address_" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
"establishment_id" : "PUKOL1964690000",
"full_address_bucket__" : {
"care_of" : null,
"adrs_long" : "gat no 329 , tardal, hatkanangle, kolhapur, 416109 , |,",
"address" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
"city" : "kolhapur",
"building_soundex" : null,
"locality" : null,
"complex_soundex" : null,
"house" : null,
"building" : null,
"is_adrs_good" : true,
"adrs_long_soundex" : "KT N TRTL HTKNNKL KLHPR",
"pin" : "416109",
"untagged" : "gat no 329, tardal, hatkanangle",
"street" : null,
"district" : "kolhapur",
"complex" : null,
"house_unsure" : "329",
"state" : "maharashtra",
"floor" : null,
"landmark" : null
}
}
},
{
"_index" : "epf_0001",
"_type" : "document",
"_id" : "PUPUN0002003000",
"_score" : 1.0,
"_source" : {
"full_address_" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
"establishment_id" : "PUPUN0002003000",
"full_address_bucket__" : {
"care_of" : null,
"adrs_long" : "tal - lonavala, 228 , gram - ravivarma taluk - lonavala, 228 , gram - ravivarma, malvali, 410405 , |,",
"address" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
"city" : "lonavala",
"building_soundex" : null,
"locality" : null,
"complex_soundex" : null,
"house" : null,
"building" : null,
"is_adrs_good" : true,
"adrs_long_soundex" : "TL LNFL KRM RFFRM TLK LNFL KRM RFFRM MLFL",
"pin" : "410405",
"untagged" : "tal - |, 228, gram - ravivarma - |, 228, -, malvali",
"street" : null,
"district" : "pune",
"complex" : null,
"house_unsure" : "228 228",
"state" : "maharashtra",
"floor" : null,
"landmark" : null
}
}
},
{
"_index" : "epf_0001",
"_type" : "document",
"_id" : "PUPUN0124020000",
"_score" : 1.0,
"_source" : {
"full_address_" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
"establishment_id" : "PUPUN0124020000",
"full_address_bucket__" : {
"care_of" : null,
"adrs_long" : "plot no . 9 survey no 105 - 108 . plot no . 9 survey no 105 - 108 ., pune, 411013 , |,",
"address" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
"city" : "pune",
"building_soundex" : null,
"locality" : null,
"complex_soundex" : null,
"house" : "plot no . 9| survey no 105 - 108 . | plot no . 9| survey no 105 - 108",
"building" : null,
"is_adrs_good" : true,
"adrs_long_soundex" : "PLT N SRF N PLT N SRF N PN",
"pin" : "411013",
"untagged" : null,
"street" : null,
"district" : "pune",
"complex" : null,
"house_unsure" : null,
"state" : "maharashtra",
"floor" : null,
"landmark" : null
}
}
}
]
}
}
Upvotes: 0
Views: 202
Reputation: 520
If you don't want to use the json as string and pull it from a location use the below code. It will directly create a Dataframe for full_address_bucket__
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.appName("Test").getOrCreate()
df = spark.read.format("json").option("inferSchema", "true").load("test14.json").select("hits.*")\
.select(F.explode("hits").alias('hits')).select('hits.*').select("_source.*").select("full_address_bucket__.*")
df.show()
Output:
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
| address| adrs_long| adrs_long_soundex|building|building_soundex|care_of| city|complex|complex_soundex|district|floor| house|house_unsure|is_adrs_good|landmark|locality| pin| state|street| untagged|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
|Gat No 329, Tarda...|gat no 329 , tard...|KT N TRTL HTKNNKL...| null| null| null|kolhapur| null| null|kolhapur| null| null| 329| true| null| null|416109|maharashtra| null|gat no 329, tarda...|
|TAL-LONAVALA,228,...|tal - lonavala, 2...|TL LNFL KRM RFFRM...| null| null| null|lonavala| null| null| pune| null| null| 228 228| true| null| null|410405|maharashtra| null|tal - |, 228, gra...|
|PLOT NO.9 S.NO. 1...|plot no . 9 surve...|PLT N SRF N PLT N...| null| null| null| pune| null| null| pune| null|plot no . 9| surv...| null| true| null| null|411013|maharashtra| null| null|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+--------------------+------------+------------+--------+--------+------+-----------+------+--------------------+
Upvotes: 1
Reputation: 520
import json
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Test").getOrCreate()
json_data = """{
"took" : 436,
"timed_out": false,
"_shards" : {
"total" : 3,
"successful" : 3,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 1194393,
"max_score" : 1.0,
"hits" : [
{
"_index" : "epf_0001",
"_type" : "document",
"_id" : "PUKOL1964690000",
"_score" : 1.0,
"_source" : {
"full_address_" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
"establishment_id" : "PUKOL1964690000",
"full_address_bucket__" : {
"care_of" : null,
"adrs_long" : "gat no 329 , tardal, hatkanangle, kolhapur, 416109 , |,",
"address" : "Gat No 329, Tardal,Hatkanangle,KOLHAPUR,416109,MAHARASHTRA,India",
"city" : "kolhapur",
"building_soundex" : null,
"locality" : null,
"complex_soundex" : null,
"house" : null,
"building" : null,
"is_adrs_good" : true,
"adrs_long_soundex" : "KT N TRTL HTKNNKL KLHPR",
"pin" : "416109",
"untagged" : "gat no 329, tardal, hatkanangle",
"street" : null,
"district" : "kolhapur",
"complex" : null,
"house_unsure" : "329",
"state" : "maharashtra",
"floor" : null,
"landmark" : null
}
}
},
{
"_index" : "epf_0001",
"_type" : "document",
"_id" : "PUPUN0002003000",
"_score" : 1.0,
"_source" : {
"full_address_" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
"establishment_id" : "PUPUN0002003000",
"full_address_bucket__" : {
"care_of" : null,
"adrs_long" : "tal - lonavala, 228 , gram - ravivarma taluk - lonavala, 228 , gram - ravivarma, malvali, 410405 , |,",
"address" : "TAL-LONAVALA,228,GRAM-RAVIVARMA TAL-LONAVALA,228,GRAM-RAVIVARMA,MALVALI,,410405,MAHARASHTRA,India",
"city" : "lonavala",
"building_soundex" : null,
"locality" : null,
"complex_soundex" : null,
"house" : null,
"building" : null,
"is_adrs_good" : true,
"adrs_long_soundex" : "TL LNFL KRM RFFRM TLK LNFL KRM RFFRM MLFL",
"pin" : "410405",
"untagged" : "tal - |, 228, gram - ravivarma - |, 228, -, malvali",
"street" : null,
"district" : "pune",
"complex" : null,
"house_unsure" : "228 228",
"state" : "maharashtra",
"floor" : null,
"landmark" : null
}
}
},
{
"_index" : "epf_0001",
"_type" : "document",
"_id" : "PUPUN0124020000",
"_score" : 1.0,
"_source" : {
"full_address_" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
"establishment_id" : "PUPUN0124020000",
"full_address_bucket__" : {
"care_of" : null,
"adrs_long" : "plot no . 9 survey no 105 - 108 . plot no . 9 survey no 105 - 108 ., pune, 411013 , |,",
"address" : "PLOT NO.9 S.NO. 105-108. PLOT NO.9 S.NO. 105-108.,PUNE,,411013,MAHARASHTRA,India",
"city" : "pune",
"building_soundex" : null,
"locality" : null,
"complex_soundex" : null,
"house" : "plot no . 9| survey no 105 - 108 . | plot no . 9| survey no 105 - 108",
"building" : null,
"is_adrs_good" : true,
"adrs_long_soundex" : "PLT N SRF N PLT N SRF N PN",
"pin" : "411013",
"untagged" : null,
"street" : null,
"district" : "pune",
"complex" : null,
"house_unsure" : null,
"state" : "maharashtra",
"floor" : null,
"landmark" : null
}
}
}
]
}
}"""
json_obj = json.loads(json_data)
json_data1 = json_obj['hits']['hits'][0]['_source']['full_address_bucket__']
json_new = [json.dumps(json_data1)]
df = spark.read.json(spark.sparkContext.parallelize(json_new))
df.show()
Output:
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
| address| adrs_long| adrs_long_soundex|building|building_soundex|care_of| city|complex|complex_soundex|district|floor|house|house_unsure|is_adrs_good|landmark|locality| pin| state|street| untagged|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
|Gat No 329, Tarda...|gat no 329 , tard...|KT N TRTL HTKNNKL...| null| null| null|kolhapur| null| null|kolhapur| null| null| 329| true| null| null|416109|maharashtra| null|gat no 329, tarda...|
+--------------------+--------------------+--------------------+--------+----------------+-------+--------+-------+---------------+--------+-----+-----+------------+------------+--------+--------+------+-----------+------+--------------------+
Upvotes: 1