Reputation: 794
I have a rather odd data structure: a list of tuples. Each tuple has five elements; the first is an identification string and the other four are floating-point numbers stored as strings (oddly, they aren't plain floats). Sorry, I get that data from others.
I would like to average elements 2-5 (the numbers) across all tuples that share the same first element. Example:
[('ch', ' 0.8307', '0.8583', '0.8047', ' 0.969'),
('de', ' 0.721', '0.7529', '0.6917', ' 0.968'),
('en', ' 0.8441', '0.8732', '0.8168', ' 0.9569'),
('fn', ' 0.8207', '0.8574', '0.7870', ' 0.9609'),
('ch', ' 0.466', '0.572', '0.7733', ' 0.969'),
('de', ' 0.322', '0.385', '0.5431', ' 0.968'),
('sp', ' 0.7609', '0.7893', '0.7344', ' 0.9663'),
('ti', ' 0.8135', '0.8430', '0.7860', ' 0.9662')]
The output should just shrink all elements having the same first index together and average their values, so it will be something like (I did not average the values in my example output here):
[('ch', ' 0.8307', '0.8583', '0.8047', ' 0.969'),
('de', ' 0.721', '0.7529', '0.6917', ' 0.968'),
('en', ' 0.8441', '0.8732', '0.8168', ' 0.9569'),
('fn', ' 0.8207', '0.8574', '0.7870', ' 0.9609'),
('sp', ' 0.7609', '0.7893', '0.7344', ' 0.9663'),
('ti', ' 0.8135', '0.8430', '0.7860', ' 0.9662')]
Is there anything very clever I could do here, instead of making a giant for-loop extracting all of it?
Upvotes: 3
Views: 651
Reputation: 51683
With pandas it is even more trivial:
import pandas as pd

# Rows are (id, v1, v2, v3, v4); the four numeric fields arrive as strings,
# some of them with a leading space.
data = [
    ('ch', ' 0.8307', '0.8583', '0.8047', ' 0.969'),
    ('de', ' 0.721', '0.7529', '0.6917', ' 0.968'),
    ('en', ' 0.8441', '0.8732', '0.8168', ' 0.9569'),
    ('fn', ' 0.8207', '0.8574', '0.7870', ' 0.9609'),
    ('ch', ' 0.466', '0.572', '0.7733', ' 0.969'),
    ('de', ' 0.322', '0.385', '0.5431', ' 0.968'),
    ('sp', ' 0.7609', '0.7893', '0.7344', ' 0.9663'),
    ('ti', ' 0.8135', '0.8430', '0.7860', ' 0.9662'),
]

# Build the frame first, then cast only the value columns (1..4) to float.
# Passing dtype=float to the constructor would also target the string id
# column 0, which recent pandas versions reject instead of silently skipping.
df = pd.DataFrame(data)
value_columns = list(df.columns[1:])
df = df.astype({column: float for column in value_columns})

# Group on the id column and average every value column per id.
print(df.groupby(0).mean())
Output:
1 2 3 4
0
ch 0.64835 0.71515 0.7890 0.9690 # pandas displays "nice" numbers,
de 0.52150 0.56895 0.6174 0.9680 # it contains the "correct" ones
en 0.84410 0.87320 0.8168 0.9569
fn 0.82070 0.85740 0.7870 0.9609
sp 0.76090 0.78930 0.7344 0.9663
ti 0.81350 0.84300 0.7860 0.9662
Upvotes: 3
Reputation: 23825
Something like the below (a solution that needs no imports):
# Rows are (id, v1, v2, v3, v4); the numeric fields are strings.
data = [
    ('ch', ' 0.8307', '0.8583', '0.8047', ' 0.969'),
    ('de', ' 0.721', '0.7529', '0.6917', ' 0.968'),
    ('en', ' 0.8441', '0.8732', '0.8168', ' 0.9569'),
    ('fn', ' 0.8207', '0.8574', '0.7870', ' 0.9609'),
    ('ch', ' 0.466', '0.572', '0.7733', ' 0.969'),
    ('de', ' 0.322', '0.385', '0.5431', ' 0.968'),
    ('sp', ' 0.7609', '0.7893', '0.7344', ' 0.9663'),
    ('ti', ' 0.8135', '0.8430', '0.7860', ' 0.9662'),
]

# Maps id -> [number of rows seen, running sum of each of the four columns].
avg_data = {}
for key, *raw_values in data:
    bucket = avg_data.setdefault(key, [0, [0, 0, 0, 0]])
    column_sums = bucket[1]
    for position, text in enumerate(raw_values):
        column_sums[position] += float(text)
    bucket[0] += 1

# One tuple per id, in first-seen order: (id, mean1, mean2, mean3, mean4).
result = [
    (key, *(total / count for total in totals))
    for key, (count, totals) in avg_data.items()
]
print(result)
output
[('ch', 0.64835, 0.71515, 0.7889999999999999, 0.969), ('de', 0.5215, 0.5689500000000001, 0.6174, 0.968), ('en', 0.8441, 0.8732, 0.8168, 0.9569), ('fn', 0.8207, 0.8574, 0.787, 0.9609), ('sp', 0.7609, 0.7893, 0.7344, 0.9663), ('ti', 0.8135, 0.843, 0.786, 0.9662)]
Upvotes: 3
Reputation: 24279
You can first create a dict to collect all values related to each id, then calculate the means:
from collections import defaultdict
# Rows are (id, v1, v2, v3, v4); the numeric fields are strings.
data = [
    ('ch', ' 0.8307', '0.8583', '0.8047', ' 0.969'),
    ('de', ' 0.721', '0.7529', '0.6917', ' 0.968'),
    ('en', ' 0.8441', '0.8732', '0.8168', ' 0.9569'),
    ('fn', ' 0.8207', '0.8574', '0.7870', ' 0.9609'),
    ('ch', ' 0.466', '0.572', '0.7733', ' 0.969'),
    ('de', ' 0.322', '0.385', '0.5431', ' 0.968'),
    ('sp', ' 0.7609', '0.7893', '0.7344', ' 0.9663'),
    ('ti', ' 0.8135', '0.8430', '0.7860', ' 0.9662'),
]


def mean(lst):
    """Return the arithmetic mean of the numbers in ``lst``."""
    total = sum(lst)
    return total / len(lst)


# Collect, for each identifier, every row of values converted to floats.
d = defaultdict(list)
for id, *values in data:
    d[id].append([float(value) for value in values])

# zip(*rows) transposes the collected rows into columns, so each column of an
# identifier is averaged on its own.
out = {}
for key, rows in d.items():
    out[key] = [mean(column) for column in zip(*rows)]
print(out)
# {'ch': [0.64835, 0.71515, 0.7889999999999999, 0.969],
# 'de': [0.5215, 0.5689500000000001, 0.6174, 0.968],
# 'en': [0.8441, 0.8732, 0.8168, 0.9569],
# 'fn': [0.8207, 0.8574, 0.787, 0.9609],
# 'sp': [0.7609, 0.7893, 0.7344, 0.9663],
# 'ti': [0.8135, 0.843, 0.786, 0.9662]}
In `for id, *values in data:`, we iterate over the tuples of `data`, put the first item of each tuple in `id`, and the remaining items in `values`.
Also, using a `defaultdict(list)` allows us to simply append the new list of values for each key, as an empty list is automatically created if the key doesn't exist yet.
Upvotes: 5
Reputation: 51683
Some number wrangling:
from collections import defaultdict
from pprint import pprint

# Rows are (id, v1, v2, v3, v4); the numeric fields are strings.
data = [
    ('ch', ' 0.8307', '0.8583', '0.8047', ' 0.969'),
    ('de', ' 0.721', '0.7529', '0.6917', ' 0.968'),
    ('en', ' 0.8441', '0.8732', '0.8168', ' 0.9569'),
    ('fn', ' 0.8207', '0.8574', '0.7870', ' 0.9609'),
    ('ch', ' 0.466', '0.572', '0.7733', ' 0.969'),
    ('de', ' 0.322', '0.385', '0.5431', ' 0.968'),
    ('sp', ' 0.7609', '0.7893', '0.7344', ' 0.9663'),
    ('ti', ' 0.8135', '0.8430', '0.7860', ' 0.9662'),
]

# Step 1: group the float-converted value rows under their identifier.
d = defaultdict(list)
for ident, *fields in data:
    d[ident].append([float(field) for field in fields])
pprint(d)

# Step 2: replace each group of rows with a single row in place.
# Only existing keys are reassigned, so the dict keeps its size while it is
# being iterated.
for key, rows in d.items():
    count = len(rows)
    if count == 1:
        # A lone row is used as-is.
        d[key] = rows[0]
    else:
        # Column-wise mean over all rows collected for this identifier.
        d[key] = [sum(column) / count for column in zip(*rows)]
pprint(d)
Output:
# after converting to float and collecting into lists
defaultdict(<class 'list'>,
{'ch': [[0.8307, 0.8583, 0.8047, 0.969],
[0.466, 0.572, 0.7733, 0.969]],
'de': [[0.721, 0.7529, 0.6917, 0.968],
[0.322, 0.385, 0.5431, 0.968]],
'en': [[0.8441, 0.8732, 0.8168, 0.9569]],
'fn': [[0.8207, 0.8574, 0.787, 0.9609]],
'sp': [[0.7609, 0.7893, 0.7344, 0.9663]],
'ti': [[0.8135, 0.843, 0.786, 0.9662]]})
# after averaging
defaultdict(<class 'list'>,
{'ch': [0.64835, 0.71515, 0.7889999999999999, 0.969],
'de': [0.5215, 0.5689500000000001, 0.6174, 0.968],
'en': [0.8441, 0.8732, 0.8168, 0.9569],
'fn': [0.8207, 0.8574, 0.787, 0.9609],
'sp': [0.7609, 0.7893, 0.7344, 0.9663],
'ti': [0.8135, 0.843, 0.786, 0.9662]})
Upvotes: 4