Reputation: 5771
How can I create a Koalas DataFrame that uses the index of another DataFrame? I can do this in pandas, but I am struggling to achieve the same in Koalas. These are my attempts so far:
from databricks import koalas as pd
import pandas
pandas (works):
dft = pandas.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
dft1 = pandas.DataFrame({'a':[2,21,31],'c':[3,4,5]}, index=dft.index)
koalas (fails with error):
dft = pd.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
dft1 = pd.DataFrame({'a':[2,21,31],'c':[3,4,5]}, index=dft.index)
output:
ValueError Traceback (most recent call last)
/tmp/ipykernel_2826623/2112004205.py in <module>
1 dft = pd.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
----> 2 dft1 = pd.DataFrame({'a':[2,21,31],'c':[3,4,5]}, index=dft.index)
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/pandas/frame.py in __init__(self, data, index, columns, dtype, copy)
517 pdf = data
518 else:
--> 519 pdf = pd.DataFrame(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
520 internal = InternalFrame.from_pandas(pdf)
521
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
612 elif isinstance(data, dict):
613 # GH#38939 de facto copy defaults to False only in non-dict cases
--> 614 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
615 elif isinstance(data, ma.MaskedArray):
616 import numpy.ma.mrecords as mrecords
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pandas/core/internals/construction.py in dict_to_mgr(data, index, columns, dtype, typ, copy)
462 # TODO: can we get rid of the dt64tz special case above?
463
--> 464 return arrays_to_mgr(
465 arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy
466 )
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity, typ, consolidate)
119 index = _extract_index(arrays)
120 else:
--> 121 index = ensure_index(index)
122
123 # don't force copy because getting jammed in an ndarray anyway
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pandas/core/indexes/base.py in ensure_index(index_like, copy)
6334 else:
6335
-> 6336 return Index(index_like, copy=copy)
6337
6338
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pandas/core/indexes/base.py in __new__(cls, data, dtype, copy, name, tupleize_cols, **kwargs)
482 data = list(data)
483
--> 484 if data and all(isinstance(e, tuple) for e in data):
485 # we must be all tuples, otherwise don't construct
486 # 10697
~/miniconda3/envs/pyspark/lib/python3.9/site-packages/pyspark/pandas/indexes/base.py in __bool__(self)
2605
2606 def __bool__(self) -> bool:
-> 2607 raise ValueError(
2608 "The truth value of a {0} is ambiguous. "
2609 "Use a.empty, a.bool(), a.item(), a.any() or a.all().".format(self.__class__.__name__)
ValueError: The truth value of a Int64Index is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
pandas (works):
dft = pandas.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
dft1 = pandas.DataFrame({'a':[2,21,31],'c':[3,4,5]})
dft1.index=dft.index
print(dft1)
output:
a c
11 2 3
12 21 4
13 31 5
koalas (fails silently: no error is raised, but the index is not replaced):
dft = pd.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
dft1 = pd.DataFrame({'a':[2,21,31],'c':[3,4,5]})
dft1.index=dft.index
print(dft1)
output:
a c
0 2 3
1 21 4
2 31 5
print(dft1.index)
output: Int64Index([0, 1, 2], dtype='int64')
Upvotes: 2
Views: 324
Reputation: 765
dft = ps.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
dft1 = ps.DataFrame({'a':[2,21,31],'c':[3,4,5]}, index=dft.index.tolist())
dft1
out:
a c
11 2 3
12 21 4
13 31 5
Upvotes: 0
Reputation: 5771
I have put together a hacky solution for now. If someone has a better solution, please let me know:
dft = dft = pd.DataFrame({'a':[1,2,3],'b':[0,1,0]},index=[11,12,13])
dft1 = dft1 = pd.DataFrame({'a':[2,21,31],'c':[3,4,5]})
index = dft.index
index = index.to_series()
index = index.reset_index(drop=True)
pd.set_option('compute.ops_on_diff_frames',True)
dft1['r'] = index
dft1 = dft1.set_index('r',drop=True)
dft1.index.name = dft.index.name
pd.reset_option('compute.ops_on_diff_frames')
dft1
output:
a c
11 2 3
12 21 4
13 31 5
Upvotes: 1