Luis Valencia
Luis Valencia

Reputation: 33998

How to use Tweepy paginator to create a pandas dataframe

It looks like `DataFrame.append` is deprecated now:

The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.

I am trying to get more than 100 tweets with tweepy, so I use `Paginator`. However, I am not sure how to properly append/concat the rows to a pandas DataFrame.

# Wrap the recent-search call in a Paginator so results can be pulled page by page.
paginator = tweepy.Paginator(
    client.search_recent_tweets,  # The client method the paginator calls
    "#publictransport -is:retweet",  # Positional argument passed on to that method (the search query)
    max_results=100  # How many tweets are asked for per request
)

import pandas as pd
# Start from an empty frame that the loop below is meant to fill row by row.
df = pd.DataFrame()

# NOTE(review): DataFrame.append returns a new frame and `df` is never
# reassigned, so every iteration appends to the still-empty `df`; after the
# loop `df2` holds only the row from the last iteration. `append` is also
# deprecated in favour of pandas.concat (see the warning quoted above).
# NOTE(review): the cell stores the whole tweet object rather than a plain
# value such as its text -- presumably related to the repr failure below; confirm.
for tweet in paginator.flatten(limit=1000): # limit caps the total number of tweets retrieved
   df2 = df.append({'Tweet':tweet}, ignore_index = True)

I get this error:

df2.head(5)

---------------------------------------------------------------------------
StopIteration                             Traceback (most recent call last)
File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/core/formatters.py:707, in PlainTextFormatter.__call__(self, obj)
    700 stream = StringIO()
    701 printer = pretty.RepresentationPrinter(stream, self.verbose,
    702     self.max_width, self.newline,
    703     max_seq_length=self.max_seq_length,
    704     singleton_pprinters=self.singleton_printers,
    705     type_pprinters=self.type_printers,
    706     deferred_pprinters=self.deferred_printers)
--> 707 printer.pretty(obj)
    708 printer.flush()
    709 return stream.getvalue()

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
    407                         return meth(obj, self, cycle)
    408                 if cls is not object \
    409                         and callable(cls.__dict__.get('__repr__')):
--> 410                     return _repr_pprint(obj, self, cycle)
    412     return _default_pprint(obj, self, cycle)
    413 finally:

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
    776 """A pprint that just redirects to the normal repr function."""
    777 # Find newlines and replace them with p.break_()
--> 778 output = repr(obj)
    779 lines = output.splitlines()
    780 with p.group():

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1011, in DataFrame.__repr__(self)
   1008     return buf.getvalue()
   1010 repr_params = fmt.get_dataframe_repr_params()
-> 1011 return self.to_string(**repr_params)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/core/frame.py:1192, in DataFrame.to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width, min_rows, max_colwidth, encoding)
   1173 with option_context("display.max_colwidth", max_colwidth):
   1174     formatter = fmt.DataFrameFormatter(
   1175         self,
   1176         columns=columns,
   (...)
   1190         decimal=decimal,
   1191     )
-> 1192     return fmt.DataFrameRenderer(formatter).to_string(
   1193         buf=buf,
   1194         encoding=encoding,
   1195         line_width=line_width,
   1196     )

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1128, in DataFrameRenderer.to_string(self, buf, encoding, line_width)
   1125 from pandas.io.formats.string import StringFormatter
   1127 string_formatter = StringFormatter(self.fmt, line_width=line_width)
-> 1128 string = string_formatter.to_string()
   1129 return save_to_buffer(string, buf=buf, encoding=encoding)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:25, in StringFormatter.to_string(self)
     24 def to_string(self) -> str:
---> 25     text = self._get_string_representation()
     26     if self.fmt.should_show_dimensions:
     27         text = "".join([text, self.fmt.dimensions_info])

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:40, in StringFormatter._get_string_representation(self)
     37 if self.fmt.frame.empty:
     38     return self._empty_info_line
---> 40 strcols = self._get_strcols()
     42 if self.line_width is None:
     43     # no need to wrap around just print the whole frame
     44     return self.adj.adjoin(1, *strcols)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/string.py:31, in StringFormatter._get_strcols(self)
     30 def _get_strcols(self) -> list[list[str]]:
---> 31     strcols = self.fmt.get_strcols()
     32     if self.fmt.is_truncated:
     33         strcols = self._insert_dot_separators(strcols)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:611, in DataFrameFormatter.get_strcols(self)
    607 def get_strcols(self) -> list[list[str]]:
    608     """
    609     Render a DataFrame to a list of columns (as lists of strings).
    610     """
--> 611     strcols = self._get_strcols_without_index()
    613     if self.index:
    614         str_index = self._get_formatted_index(self.tr_frame)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:875, in DataFrameFormatter._get_strcols_without_index(self)
    871 cheader = str_columns[i]
    872 header_colwidth = max(
    873     int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader)
    874 )
--> 875 fmt_values = self.format_col(i)
    876 fmt_values = _make_fixed_width(
    877     fmt_values, self.justify, minimum=header_colwidth, adj=self.adj
    878 )
    880 max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:889, in DataFrameFormatter.format_col(self, i)
    887 frame = self.tr_frame
    888 formatter = self._get_formatter(i)
--> 889 return format_array(
    890     frame.iloc[:, i]._values,
    891     formatter,
    892     float_format=self.float_format,
    893     na_rep=self.na_rep,
    894     space=self.col_space.get(frame.columns[i]),
    895     decimal=self.decimal,
    896     leading_space=self.index,
    897 )

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1316, in format_array(values, formatter, float_format, na_rep, digits, space, justify, decimal, leading_space, quoting)
   1301     digits = get_option("display.precision")
   1303 fmt_obj = fmt_klass(
   1304     values,
   1305     digits=digits,
   (...)
   1313     quoting=quoting,
   1314 )
-> 1316 return fmt_obj.get_result()

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1347, in GenericArrayFormatter.get_result(self)
   1346 def get_result(self) -> list[str]:
-> 1347     fmt_values = self._format_strings()
   1348     return _make_fixed_width(fmt_values, self.justify)

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1410, in GenericArrayFormatter._format_strings(self)
   1408 for i, v in enumerate(vals):
   1409     if not is_float_type[i] and leading_space:
-> 1410         fmt_values.append(f" {_format(v)}")
   1411     elif is_float_type[i]:
   1412         fmt_values.append(float_format(v))

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/format.py:1390, in GenericArrayFormatter._format_strings.<locals>._format(x)
   1387     return str(x)
   1388 else:
   1389     # object dtype
-> 1390     return str(formatter(x))

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:222, in pprint_thing(thing, _nest_lvl, escape_chars, default_escapes, quote_strings, max_seq_items)
    218     result = _pprint_dict(
    219         thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items
    220     )
    221 elif is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth"):
--> 222     result = _pprint_seq(
    223         thing,
    224         _nest_lvl,
    225         escape_chars=escape_chars,
    226         quote_strings=quote_strings,
    227         max_seq_items=max_seq_items,
    228     )
    229 elif isinstance(thing, str) and quote_strings:
    230     result = f"'{as_escaped_string(thing)}'"

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:119, in _pprint_seq(seq, _nest_lvl, max_seq_items, **kwds)
    117 s = iter(seq)
    118 # handle sets, no slicing
--> 119 r = [
    120     pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
    121     for i in range(min(nitems, len(seq)))
    122 ]
    123 body = ", ".join(r)
    125 if nitems < len(seq):

File /anaconda/envs/GPSAnalysis/lib/python3.9/site-packages/pandas/io/formats/printing.py:120, in <listcomp>(.0)
    117 s = iter(seq)
    118 # handle sets, no slicing
    119 r = [
--> 120     pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)
    121     for i in range(min(nitems, len(seq)))
    122 ]
    123 body = ", ".join(r)
    125 if nitems < len(seq):

StopIteration: 

Upvotes: 0

Views: 483

Answers (1)

Laurent
Laurent

Reputation: 13478

I can't reproduce your error, so I am working blind here, but here is one way to do what you asked:

# Build a one-row frame per tweet and stack them; ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each small frame's index 0.
df = pd.concat(
    [pd.DataFrame({"Tweet": [tweet]}) for tweet in paginator.flatten(limit=1000)],
    ignore_index=True,
)

However, you do not need pd.concat or append to achieve the same result:

# Collect every tweet the paginator yields into a single column in one step.
# The column is named "Tweet" so the frame matches the pd.concat version above;
# list() replaces the identity comprehension.
df = pd.DataFrame({"Tweet": list(paginator.flatten(limit=1000))})

Upvotes: 1

Related Questions