Reputation: 29
I'm trying to merge the two lists produced inside a for loop into a single list. I tried append() and extend(), to no avail.
Here's my code:
c_files = ['file1', 'file2']
doc_count = 0
comb_cran = []
for fname in c_files:
    # Read the whole document into memory.
    with open(fname, 'r') as cr:
        cran = cr.read()
    doc_count += 1
    docID = os.path.basename(fname)

    # TOKENIZING:
    # strip SGML tags
    removedsgml_cran = BeautifulSoup(cran, "html.parser")
    clean_cran = removedsgml_cran.get_text()
    # replace every non-alphanumeric run with a single space, then lowercase
    non_alpha = re.compile('([^\s\w]|_)+')
    alpha = non_alpha.sub(' ', clean_cran)
    alpha_lower = alpha.lower()
    # split into word tokens with nltk
    tokenized = word_tokenize(alpha_lower)

    # combine the list
    #comb_cran.extend(tokenized)
    cword_removed = [(w, docID, 1) for w in tokenized if w not in c_words]
    print(cword_removed)
and the results:
[('1', 'cranfield0001', 1), ('experimental', 'cranfield0001', 1), ('investigation', 'cranfield0001', 1), ('aerodynamics', 'cranfield0001', 1), ('slipstream', 'cranfield0001', 1), ('brenckman', 'cranfield0001', 1), ('ae', 'cranfield0001', 1), ('scs', 'cranfield0001', 1), ('25', 'cranfield0001', 1), ('1958', 'cranfield0001', 1), ('324', 'cranfield0001', 1), ('experimental', 'cranfield0001', 1), ('study', 'cranfield0001', 1), ('propeller', 'cranfield0001', 1), ('slipstream', 'cranfield0001', 1), ('made', 'cranfield0001', 1), ('order', 'cranfield0001', 1), ('determine', 'cranfield0001', 1), ('spanwise', 'cranfield0001', 1), ('distribution', 'cranfield0001', 1), ('lift', 'cranfield0001', 1), ('increase', 'cranfield0001', 1), ('due', 'cranfield0001', 1), ('slipstream', 'cranfield0001', 1), ('angles', 'cranfield0001', 1), ('attack', 'cranfield0001', 1), ('free', 'cranfield0001', 1), ('stream', 'cranfield0001', 1), ('slipstream', 'cranfield0001', 1), ('velocity', 'cranfield0001', 1), ('ratios', 'cranfield0001', 1), ('results', 'cranfield0001', 1), ('intended', 'cranfield0001', 1), ('evaluation', 'cranfield0001', 1), ('basis', 'cranfield0001', 1), ('theoretical', 'cranfield0001', 1), ('treatments', 'cranfield0001', 1), ('problem', 'cranfield0001', 1), ('comparative', 'cranfield0001', 1), ('span', 'cranfield0001', 1), ('loading', 'cranfield0001', 1), ('curves', 'cranfield0001', 1), ('supporting', 'cranfield0001', 1), ('evidence', 'cranfield0001', 1), ('showed', 'cranfield0001', 1), ('substantial', 'cranfield0001', 1), ('lift', 'cranfield0001', 1), ('increment', 'cranfield0001', 1), ('produced', 'cranfield0001', 1), ('slipstream', 'cranfield0001', 1), ('due', 'cranfield0001', 1), ('destalling', 'cranfield0001', 1), ('boundary', 'cranfield0001', 1), ('layer', 'cranfield0001', 1), ('control', 'cranfield0001', 1), ('effect', 'cranfield0001', 1), ('integrated', 'cranfield0001', 1), ('remaining', 'cranfield0001', 1), ('lift', 'cranfield0001', 1), ('increment', 'cranfield0001', 1), 
('subtracting', 'cranfield0001', 1), ('destalling', 'cranfield0001', 1), ('lift', 'cranfield0001', 1), ('found', 'cranfield0001', 1), ('agree', 'cranfield0001', 1), ('potential', 'cranfield0001', 1), ('flow', 'cranfield0001', 1), ('theory', 'cranfield0001', 1), ('empirical', 'cranfield0001', 1), ('evaluation', 'cranfield0001', 1), ('destalling', 'cranfield0001', 1), ('effects', 'cranfield0001', 1), ('made', 'cranfield0001', 1), ('specific', 'cranfield0001', 1), ('configuration', 'cranfield0001', 1), ('experiment', 'cranfield0001', 1)]
[('2', 'cranfield0002', 1), ('simple', 'cranfield0002', 1), ('shear', 'cranfield0002', 1), ('flow', 'cranfield0002', 1), ('past', 'cranfield0002', 1), ('flat', 'cranfield0002', 1), ('plate', 'cranfield0002', 1), ('incompressible', 'cranfield0002', 1), ('fluid', 'cranfield0002', 1), ('small', 'cranfield0002', 1), ('viscosity', 'cranfield0002', 1), ('yili', 'cranfield0002', 1), ('department', 'cranfield0002', 1), ('aeronautical', 'cranfield0002', 1), ('engineering', 'cranfield0002', 1), ('rensselaer', 'cranfield0002', 1), ('polytechnic', 'cranfield0002', 1), ('institute', 'cranfield0002', 1), ('troy', 'cranfield0002', 1), ('study', 'cranfield0002', 1), ('high', 'cranfield0002', 1), ('speed', 'cranfield0002', 1), ('viscous', 'cranfield0002', 1), ('flow', 'cranfield0002', 1), ('past', 'cranfield0002', 1), ('dimensional', 'cranfield0002', 1), ('curved', 'cranfield0002', 1), ('shock', 'cranfield0002', 1), ('wave', 'cranfield0002', 1), ('emitting', 'cranfield0002', 1), ('nose', 'cranfield0002', 1), ('leading', 'cranfield0002', 1), ('edge', 'cranfield0002', 1), ('exists', 'cranfield0002', 1), ('inviscid', 'cranfield0002', 1), ('rotational', 'cranfield0002', 1), ('flow', 'cranfield0002', 1), ('region', 'cranfield0002', 1), ('shock', 'cranfield0002', 1), ('wave', 'cranfield0002', 1), ('boundary', 'cranfield0002', 1), ('layer', 'cranfield0002', 1), ('situation', 'cranfield0002', 1), ('arises', 'cranfield0002', 1), ('instance', 'cranfield0002', 1), ('study', 'cranfield0002', 1), ('hypersonic', 'cranfield0002', 1), ('viscous', 'cranfield0002', 1), ('flow', 'cranfield0002', 1), ('past', 'cranfield0002', 1), ('flat', 'cranfield0002', 1), ('plate', 'cranfield0002', 1), ('situation', 'cranfield0002', 1), ('prandtl', 'cranfield0002', 1), ('classical', 'cranfield0002', 1), ('boundary', 'cranfield0002', 1), ('layer', 'cranfield0002', 1), ('problem', 'cranfield0002', 1), ('prandtl', 'cranfield0002', 1), ('original', 'cranfield0002', 1), ('problem', 'cranfield0002', 1), ('inviscid', 
'cranfield0002', 1), ('free', 'cranfield0002', 1), ('stream', 'cranfield0002', 1), ('boundary', 'cranfield0002', 1), ('layer', 'cranfield0002', 1), ('irrotational', 'cranfield0002', 1), ('hypersonic', 'cranfield0002', 1), ('boundary', 'cranfield0002', 1), ('layer', 'cranfield0002', 1), ('problem', 'cranfield0002', 1), ('inviscid', 'cranfield0002', 1), ('free', 'cranfield0002', 1), ('stream', 'cranfield0002', 1), ('considered', 'cranfield0002', 1), ('rotational', 'cranfield0002', 1), ('effects', 'cranfield0002', 1), ('vorticity', 'cranfield0002', 1), ('recently', 'cranfield0002', 1), ('discussed', 'cranfield0002', 1), ('ferri', 'cranfield0002', 1), ('libby', 'cranfield0002', 1), ('present', 'cranfield0002', 1), ('paper', 'cranfield0002', 1), ('simple', 'cranfield0002', 1), ('shear', 'cranfield0002', 1), ('flow', 'cranfield0002', 1), ('past', 'cranfield0002', 1), ('flat', 'cranfield0002', 1), ('plate', 'cranfield0002', 1), ('fluid', 'cranfield0002', 1), ('small', 'cranfield0002', 1), ('viscosity', 'cranfield0002', 1), ('investigated', 'cranfield0002', 1), ('shown', 'cranfield0002', 1), ('problem', 'cranfield0002', 1), ('treated', 'cranfield0002', 1), ('boundary', 'cranfield0002', 1), ('layer', 'cranfield0002', 1), ('approximation', 'cranfield0002', 1), ('feature', 'cranfield0002', 1), ('free', 'cranfield0002', 1), ('stream', 'cranfield0002', 1), ('constant', 'cranfield0002', 1), ('vorticity', 'cranfield0002', 1), ('discussion', 'cranfield0002', 1), ('restricted', 'cranfield0002', 1), ('dimensional', 'cranfield0002', 1), ('incompressible', 'cranfield0002', 1), ('steady', 'cranfield0002', 1), ('flow', 'cranfield0002', 1)]
The results are correct, however they are separate lists printed from each loop. I need them to be in a single list.
I tried putting the print outside of the loop, then it prints only the second list.
Upvotes: 0
Views: 4153
Reputation: 79
You are overwriting your list on every iteration and then printing the new one. Lists can be concatenated: define your list before the loop and add each new list onto what is already there using full_list += new_list, then put the final print statement outside the for loop.
c_files = ['file1', 'file2']
doc_count = 0
comb_cran = []
# Accumulates (word, docID, 1) tuples across ALL files. It is created before
# the loop so each iteration adds to it instead of replacing it.
cword_removed = []
# One or more characters that are neither whitespace nor word characters, or
# underscores. Compiled once outside the loop; the raw string keeps \s and \w
# from being treated as (invalid) string escapes.
non_alpha = re.compile(r'([^\s\w]|_)+')

for fname in c_files:
    with open(fname, 'r') as cr:
        cran = cr.read()
    doc_count += 1
    docID = os.path.basename(fname)

    # TOKENIZING:
    # remove SGML tags
    removedsgml_cran = BeautifulSoup(cran, "html.parser")
    clean_cran = removedsgml_cran.get_text()
    # replace non-alphanumeric runs with a space, then lowercase
    alpha = non_alpha.sub(' ', clean_cran)
    alpha_lower = alpha.lower()
    # word tokenizing using nltk
    tokenized = word_tokenize(alpha_lower)

    # Add this file's tuples onto the combined list.
    # NOTE(review): c_words is defined elsewhere; presumably the collection of
    # words to exclude -- confirm against the rest of the script.
    cword_removed += [(w, docID, 1) for w in tokenized if w not in c_words]

# Printed once, after the loop, so the output is a single merged list.
print(cword_removed)
Upvotes: 0
Reputation: 3848
What you are running into here is name rebinding, which is easy to mistake for a scope problem.
In your program, the list in question is cword_removed. You only create and assign it inside the loop. When your program reaches cword_removed the first time, it creates a variable of type list and populates it with the list comprehension you have defined. When the program goes around the loop the second time, it overwrites the variable with the new values! Note that a Python for loop does not create a new scope: the plain = assignment simply rebinds cword_removed to a brand-new list on every iteration, so the previous contents are discarded.
To remedy this, initialize the variable before the loop, and extend it on each iteration using += with the list comprehension:
# Sketch of the fix: only the lines that matter are shown; the dotted lines
# stand in for the asker's existing read/tokenize code.
# NOTE(review): indentation was lost in this snippet -- the `with` statement,
# the elided code and the `+=` line all belong inside the `for` loop.
c_files =['file1','file2']
doc_count = 0
comb_cran = []
# Created once, before the loop, so every iteration adds to the same list.
cword_removed = []
for fname in c_files:
with open(fname,'r') as cr:
.
. # the rest of your code
.
# `+=` extends the existing list in place instead of rebinding the name to a
# new list, so results from earlier files are kept.
cword_removed += [(w,docID,1) for w in tokenized if not w in c_words]
Upvotes: 1
Reputation: 1687
As you loop over the files, you get one list per file. You can easily combine these lists using the simple '+' operator. The corrected code is:
c_files = ['file1', 'file2']
doc_count = 0
comb_cran = []
total_list = []  # your final wanted list, created once before the loop
for fname in c_files:
    with open(fname, 'r') as cr:
        ...  # your existing code goes here (it must define docID and tokenized)
    # Combine the lists with the simple '+' operator: the previous contents
    # plus this file's tuples become the new total_list.
    total_list = total_list + [(w, docID, 1) for w in tokenized if w not in c_words]

# Print the combined list once, after the loop. The name printed must be
# total_list -- cword_removed is never defined in this version.
print(total_list)
Upvotes: 0