Reputation: 43
I wrote a code for elastic search where I'm giving the movie_name as search_term but when it got the match according to the jaro winkler condition i.e
for i in es_data:
if (i['_source']['entity_type'] == 'movie_entity'):
dist = distance.get_jaro_distance(search_term, i['_source']['entity_name'], winkler=True, scaling=0.1)
if dist > 0.80:
This code is returning the correct output, but when there is no match, I get an error. I tried putting else statement but the error is still happening.
Can anyone help me with this issue?
from..items import DeccanchronicleItem
import mysql.connector
from mysql.connector import Error
from mysql.connector import errorcode
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import boto3
import spacy
import fuzzy
from pyjarowinkler import distance
import json
import scrapy
import re
class DeccanchronicleSpider(scrapy.Spider):
name = 'a_review'
page_number = 2
start_urls = ['https://www.deccanchronicle.com/entertainment/movie-review?pg=1'
]
def parse(self, response):
items = {}
i = 1
movie_title = response.xpath('//*[@id="fullBody"]/div[4]/div[3]/div[1]/div[*]/div[2]/a/h3/text()').getall()
movie_text = response.xpath('//*[@id="fullBody"]/div[4]/div[3]/div[1]/div[*]/div[2]/a/div[1]/text()').getall()
movie_id = response.xpath('//*[@id="fullBody"]/div[4]/div[3]/div[1]/div[*]/div[2]/a/@href').getall()
items['movie_title'] = movie_title
items['movie_text'] = movie_text
items['movie_id'] = movie_id
li = items['movie_title']
for i in range(len(li)):
li_split = li[i].split(" ")
#print(movietitle)
if 'Review:' in li_split or 'review:' in li_split:
outputs = DeccanchronicleItem()
outputs['page_title'] = li[i]
outputs['review_content'] = items['movie_text'][i]
outputs['review_link'] = 'https://www.deccanchronicle.com' + str(items['movie_id'][i])
nlp = spacy.load('/Users/divyanshu/review_bot/review_bot/NER_model')
def actor_mid_ner(sentence):
doc = nlp(sentence)
detected_hash = {}
# detected_hash = { ent.label_ : ([ent.text] if ent.label_ is None else ) for ent in doc.ents}
for ent in doc.ents:
label = ent.label_
detected = detected_hash.keys()
omit = ['Unwanted']
if label not in omit:
if label not in detected:
detected_hash[label] = [ent.text]
else:
detected_hash[label].append(ent.text)
else:
detected_hash[label] = [ent.text]
return detected_hash, detected
sentence = outputs['page_title']
ner_hash, ner_keys = actor_mid_ner(sentence)
movie_name = " ".join(str(x) for x in ner_hash['MOVIE'] )
print('-----------------------------------')
print(movie_name)
print('-----------------------------------')
def elasticsearch(movie_name):
search_term = movie_name
host = 'xxxxxxxxxxxxxxx' # For example, my-test-domain.us-east-1.es.amazonaws.com
region = 'ap-southeast-1' # e.g. us-west-1
service = 'es'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)
es = Elasticsearch(
hosts = [{'host': host, 'port': 443}],
http_auth = awsauth,
use_ssl = True,
verify_certs = True,
connection_class = RequestsHttpConnection
)
body = {
"query": {
"multi_match" : {
"query": search_term,
"fields": ["entity_name", "aka"],
"fuzziness": "AUTO"
}
}
}
res = es.search(index="production-widget_id_search", body=body)
es_data = res['hits']['hits']
# print(es_data)
for i in es_data:
if (i['_source']['entity_type'] == 'movie_entity'):
dist = distance.get_jaro_distance(search_term, i['_source']['entity_name'], winkler=True, scaling=0.1)
if dist > 0.80:
return (i['_source']['entity_id'], i['_source']['entity_name'])
movie_id , movie_name_es = elasticsearch(movie_name)
review_url = outputs['review_link']
print('-----------------------------------')
print(movie_id)
print('-----------------------------------')
print(movie_name)
print('-----------------------------------')
print(movie_name_es)
print('-----------------------------------')
print(review_url)
print('***********************************')
try:
connection = mysql.connector.connect(host='localhost',
database='review_url',
user='root',
password='admin')
mySql_insert_query = """INSERT INTO k_master_movie_reviews (id, title, title_es, url)
VALUES(%s,%s,%s,%s)""",(movie_id, movie_name, movie_name_es, review_url )
cursor = connection.cursor()
cursor.execute(mySql_insert_query)
connection.commit()
print(cursor.rowcount, "Record inserted successfully into table")
cursor.close()
except mysql.connector.Error as error:
print("Failed to insert record into table {}".format(error))
finally:
if (connection.is_connected()):
connection.close()
print("MySQL connection is closed")
outputs['id'] = movie_id
outputs['title'] = movie_name
outputs['title_es'] = movie_name_es
outputs['url'] = review_url
yield outputs
pass
next_page = 'https://www.deccanchronicle.com/entertainment/movie-review?pg=' + str(DeccanchronicleSpider.page_number)
if DeccanchronicleSpider.page_number <= 5:
DeccanchronicleSpider.page_number += 1
yield response.follow(next_page, callback = self.parse)
This the error I'm getting
Traceback (most recent call last):
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/utils/defer.py", line 117, in iter_errback
yield next(it)
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/utils/python.py", line 345, in __next__
return next(self.data)
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/utils/python.py", line 345, in __next__
return next(self.data)
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/spidermiddlewares/referer.py", line 338, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/Users/divyanshu/env/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 64, in _evaluate_iterable
for r in iterable:
File "/Users/divyanshu/review_bot/review_bot/spiders/a.py", line 515, in parse
movie_id , movie_name_es = elasticsearch(movie_name)
TypeError: cannot unpack non-iterable NoneType object
Upvotes: 0
Views: 466
Reputation: 1168
You can try removing this line :
movie_id , movie_name_es = elasticsearch(movie_name)
And instead do this :
es_results = elasticsearch(movie_name)
movie_id = es_results[0]
movie_name_es = es_results[1]
I am giving this solution because your error is pointing to this line -
File "/Users/divyanshu/review_bot/review_bot/spiders/a.py", line 515, in parse
movie_id , movie_name_es = elasticsearch(movie_name)
TypeError: cannot unpack non-iterable NoneType object
Upvotes: 0
Reputation: 27577
Let's have a loom at the last parts of your elasticsearch()
function:
res = es.search(index="production-widget_id_search", body=body)
es_data = res['hits']['hits']
# print(es_data)
for i in es_data:
if (i['_source']['entity_type'] == 'movie_entity'):
dist = distance.get_jaro_distance(search_term, i['_source']['entity_name'], winkler=True, scaling=0.1)
if dist > 0.80:
return (i['_source']['entity_id'], i['_source']['entity_name'])
You have a for
loop, and in each iteration, there are two if
conditions. If there isn't a single iteration in which both conditions are met, you function will never reach the return
statement, thus, it will return None
.
To fix the unpacking, you can add a another return
statement into your function that will be initialized if the for
loop doesn't return anything::
res = es.search(index="production-widget_id_search", body=body)
es_data = res['hits']['hits']
# print(es_data)
for i in es_data:
if (i['_source']['entity_type'] == 'movie_entity'):
dist = distance.get_jaro_distance(search_term, i['_source']['entity_name'], winkler=True, scaling=0.1)
if dist > 0.80:
return (i['_source']['entity_id'], i['_source']['entity_name'])
return (None, None)
Upvotes: 0
Reputation: 941
That's because your elasticsearch()
function when there is no match will return None
which you then immediately unpack into movie_id
and movie_name_es
. I suggest adding return (None, None)
to the end of the elasticsearch()
function.
Upvotes: 1