1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
6
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
9labeling information
10"""
from __future__ import division
# pylint: disable=E1101,E1103
# pylint: disable=W0212,W0231,W0703,W0622

import functools
import collections
import itertools
import sys
import types
import warnings
from textwrap import dedent

import numpy as np
import numpy.ma as ma

from pandas.core.accessor import CachedAccessor
from pandas.core.dtypes.cast import (
    maybe_upcast,
    cast_scalar_to_array,
    construct_1d_arraylike_from_scalar,
    maybe_cast_to_datetime,
    maybe_infer_to_datetimelike,
    maybe_convert_platform,
    maybe_downcast_to_dtype,
    invalidate_string_dtypes,
    coerce_to_dtypes,
    maybe_upcast_putmask,
    find_common_type)
from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_object_dtype,
    is_extension_type,
    is_extension_array_dtype,
    is_datetimetz,
    is_datetime64_any_dtype,
    is_bool_dtype,
    is_integer_dtype,
    is_float_dtype,
    is_integer,
    is_scalar,
    is_dtype_equal,
    needs_i8_conversion,
    _get_dtype_from_object,
    _ensure_float64,
    _ensure_int64,
    _ensure_platform_int,
    is_list_like,
    is_nested_list_like,
    is_iterator,
    is_sequence,
    is_named_tuple)
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
from pandas.core.dtypes.missing import isna, notna


from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.index import (Index, MultiIndex, _ensure_index,
                               _ensure_index_from_sequences)
from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
                                  check_bool_indexer)
from pandas.core.internals import (BlockManager,
                                   create_block_manager_from_arrays,
                                   create_block_manager_from_blocks)
from pandas.core.series import Series
from pandas.core.arrays import Categorical, ExtensionArray
import pandas.core.algorithms as algorithms
from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
                           OrderedDict, raise_with_traceback)
from pandas import compat
from pandas.compat import PY36
from pandas.compat.numpy import function as nv
from pandas.util._decorators import (Appender, Substitution,
                                     rewrite_axis_style_signature)
from pandas.util._validators import (validate_bool_kwarg,
                                     validate_axis_style_args)

from pandas.core.indexes.period import PeriodIndex
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex

import pandas.core.common as com
import pandas.core.nanops as nanops
import pandas.core.ops as ops
import pandas.io.formats.console as console
import pandas.io.formats.format as fmt
from pandas.io.formats.printing import pprint_thing
import pandas.plotting._core as gfx

from pandas._libs import lib, algos as libalgos

from pandas.core.config import get_option

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = dict(
    axes='index, columns', klass='DataFrame',
    axes_single_arg="{0 or 'index', 1 or 'columns'}",
    axis="""
    axis : {0 or 'index', 1 or 'columns'}, default 0
        - 0 or 'index': apply function to each column.
        - 1 or 'columns': apply function to each row.""",
    optional_by="""
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels

        .. versionchanged:: 0.23.0
           Allow specifying index or column level names.""",
    versionadded_to_excel='',
    optional_labels="""labels : array-like, optional
        New labels / index to conform the axis specified by 'axis' to.""",
    optional_axis="""axis : int or str, optional
        Axis to target. Can be either the axis name ('index', 'columns')
        or number (0, 1).""",
)

_numeric_only_doc = """numeric_only : boolean, default None
    Include only float, int, boolean data. If None, will attempt to use
    everything, then use only numeric data
"""

_merge_doc = """
Merge DataFrame objects by performing a database-style join operation by
columns or indexes.

If joining columns on columns, the DataFrame indexes *will be
ignored*. Otherwise if joining indexes on indexes or indexes on a column or
columns, the index will be passed on.

Parameters
----------%s
right : DataFrame
how : {'left', 'right', 'outer', 'inner'}, default 'inner'
    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys
on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : boolean, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels
right_index : boolean, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index
sort : boolean, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword)
suffixes : 2-length sequence (tuple, list, ...)
    Suffix to apply to overlapping column names in the left and right
    side, respectively
copy : boolean, default True
    If False, do not copy data unnecessarily
indicator : boolean or string, default False
    If True, adds a column to output DataFrame called "_merge" with
    information on the source of each row.
    If string, column with information on source of each row will be added to
    output DataFrame, and column will be named value of string.
    Information column is Categorical-type and takes on a value of "left_only"
    for observations whose merge key only appears in 'left' DataFrame,
    "right_only" for observations whose merge key only appears in 'right'
    DataFrame, and "both" if the observation's merge key is found in both.

validate : string, default None
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

    .. versionadded:: 0.21.0

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0

Examples
--------

>>> A              >>> B
    lkey value         rkey value
0   foo  1         0   foo  5
1   bar  2         1   bar  6
2   baz  3         2   qux  7
3   foo  4         3   bar  8

>>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
   lkey  value_x rkey  value_y
0  foo   1        foo   5
1  foo   4        foo   5
2  bar   2        bar   6
3  bar   2        bar   8
4  baz   3        NaN   NaN
5  NaN   NaN      qux   7

Returns
-------
merged : DataFrame
    The output type will be the same as 'left', if it is a subclass
    of DataFrame.

See also
--------
merge_ordered
merge_asof
DataFrame.join
"""

# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame):
    """ Two-dimensional size-mutable, potentially heterogeneous tabular data
    structure with labeled axes (rows and columns). Arithmetic operations
    align on both row and column labels. Can be thought of as a dict-like
    container for Series objects. The primary pandas data structure.

    Parameters
    ----------
    data : numpy ndarray (structured or homogeneous), dict, or DataFrame
        Dict can contain Series, arrays, constants, or list-like objects

        .. versionchanged :: 0.23.0
           If data is a dict, argument order is maintained for Python 3.6
           and later.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided
    columns : Index or array-like
        Column labels to use for resulting frame. Will default to
        RangeIndex (0, 1, 2, ..., n) if no column labels are provided
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer
    copy : boolean, default False
        Copy data from inputs. Only affects DataFrame / 2d ndarray input

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)),
    ...                    columns=['a', 'b', 'c', 'd', 'e'])
    >>> df2
       a  b  c  d  e
    0  2  8  8  3  4
    1  4  2  9  0  9
    2  1  0  7  8  0
    3  5  1  7  1  3
    4  6  0  2  4  2

    See also
    --------
    DataFrame.from_records : constructor from tuples, also record arrays
    DataFrame.from_dict : from dicts of Series, arrays, or dicts
    DataFrame.from_items : from sequence of (key, value) pairs
    pandas.read_csv, pandas.read_table, pandas.read_clipboard
    """

    @property
    def _constructor(self):
        return DataFrame

    _constructor_sliced = Series
    _deprecations = NDFrame._deprecations | frozenset(
        ['sortlevel', 'get_value', 'set_value', 'from_csv', 'from_items'])
    _accessors = set()

    @property
    def _constructor_expanddim(self):
        from pandas.core.panel import Panel
        return Panel

    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 copy=False):
        if data is None:
            data = {}
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._data

        if isinstance(data, BlockManager):
            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                                 dtype=dtype, copy=copy)
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, ma.MaskedArray):
            import numpy.ma.mrecords as mrecords
            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                mgr = _masked_rec_array_to_mgr(data, index, columns, dtype,
                                               copy)

            # a masked array
            else:
                mask = ma.getmaskarray(data)
                if mask.any():
                    data, fill_value = maybe_upcast(data, copy=True)
                    data[mask] = fill_value
                else:
                    data = data.copy()
                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                         copy=copy)

        elif isinstance(data, (np.ndarray, Series, Index)):
            if data.dtype.names:
                data_columns = list(data.dtype.names)
                data = {k: data[k] for k in data_columns}
                if columns is None:
                    columns = data_columns
                mgr = self._init_dict(data, index, columns, dtype=dtype)
            elif getattr(data, 'name', None) is not None:
                mgr = self._init_dict({data.name: data}, index, columns,
                                      dtype=dtype)
            else:
                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                         copy=copy)
        elif isinstance(data, (list, types.GeneratorType)):
            if isinstance(data, types.GeneratorType):
                data = list(data)
            if len(data) > 0:
                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
                    if is_named_tuple(data[0]) and columns is None:
                        columns = data[0]._fields
                    arrays, columns = _to_arrays(data, columns, dtype=dtype)
                    columns = _ensure_index(columns)

                    # set the index
                    if index is None:
                        if isinstance(data[0], Series):
                            index = _get_names_from_index(data)
                        elif isinstance(data[0], Categorical):
                            index = com._default_index(len(data[0]))
                        else:
                            index = com._default_index(len(data))

                    mgr = _arrays_to_mgr(arrays, columns, index, columns,
                                         dtype=dtype)
                else:
                    mgr = self._init_ndarray(data, index, columns, dtype=dtype,
                                             copy=copy)
            else:
                mgr = self._init_dict({}, index, columns, dtype=dtype)
        elif isinstance(data, collections.Iterator):
            raise TypeError("data argument can't be an iterator")
        else:
            try:
                arr = np.array(data, dtype=dtype, copy=copy)
            except (ValueError, TypeError) as e:
                exc = TypeError('DataFrame constructor called with '
                                'incompatible data and dtype: {e}'.format(e=e))
                raise_with_traceback(exc)

            if arr.ndim == 0 and index is not None and columns is not None:
                values = cast_scalar_to_array((len(index), len(columns)),
                                              data, dtype=dtype)
                mgr = self._init_ndarray(values, index, columns,
                                         dtype=values.dtype, copy=False)
            else:
                raise ValueError('DataFrame constructor not properly called!')

        NDFrame.__init__(self, mgr, fastpath=True)

    def _init_dict(self, data, index, columns, dtype=None):
        """
        Segregate Series based on type and coerce into matrices.
        Needs to handle a lot of exceptional cases.
        """
        if columns is not None:
            arrays = Series(data, index=columns, dtype=object)
            data_names = arrays.index

            missing = arrays.isnull()
            if index is None:
                # GH10856
                # raise ValueError if only scalars in dict
                index = extract_index(arrays[~missing])
            else:
                index = _ensure_index(index)

            # no obvious "empty" int column
            if missing.any() and not is_integer_dtype(dtype):
                if dtype is None or np.issubdtype(dtype, np.flexible):
                    # GH1783
                    nan_dtype = object
                else:
                    nan_dtype = dtype
                v = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                       nan_dtype)
                arrays.loc[missing] = [v] * missing.sum()

        else:
            keys = com._dict_keys_to_ordered_list(data)
            columns = data_names = Index(keys)
            arrays = [data[k] for k in keys]

        return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)

    def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
        # input must be a ndarray, list, Series, index

        if isinstance(values, Series):
            if columns is None:
                if values.name is not None:
                    columns = [values.name]
            if index is None:
                index = values.index
            else:
                values = values.reindex(index)

            # zero len case (GH #2234)
            if not len(values) and columns is not None and len(columns):
                values = np.empty((0, 1), dtype=object)

        # helper to create the axes as indexes
        def _get_axes(N, K, index=index, columns=columns):
            # return axes or defaults

            if index is None:
                index = com._default_index(N)
            else:
                index = _ensure_index(index)

            if columns is None:
                columns = com._default_index(K)
            else:
                columns = _ensure_index(columns)
            return index, columns

        # we could have a categorical type passed or coerced to 'category'
        # recast this to an _arrays_to_mgr
        if (is_categorical_dtype(getattr(values, 'dtype', None)) or
                is_categorical_dtype(dtype)):

            if not hasattr(values, 'dtype'):
                values = _prep_ndarray(values, copy=copy)
                values = values.ravel()
            elif copy:
                values = values.copy()

            index, columns = _get_axes(len(values), 1)
            return _arrays_to_mgr([values], columns, index, columns,
                                  dtype=dtype)
        elif is_datetimetz(values) or is_extension_array_dtype(values):
            # GH19157
            if columns is None:
                columns = [0]
            return _arrays_to_mgr([values], columns, index, columns,
                                  dtype=dtype)

        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

        if dtype is not None:
            if not is_dtype_equal(values.dtype, dtype):
                try:
                    values = values.astype(dtype)
                except Exception as orig:
                    e = ValueError("failed to cast to '{dtype}' (Exception "
                                   "was: {orig})".format(dtype=dtype,
                                                         orig=orig))
                    raise_with_traceback(e)

        index, columns = _get_axes(*values.shape)
        values = values.T

        # if we don't have a dtype specified, then try to convert objects
        # on the entire block; this is to convert if we have datetimelike's
        # embedded in an object type
        if dtype is None and is_object_dtype(values):
            values = maybe_infer_to_datetimelike(values)

        return create_block_manager_from_blocks([values], [columns, index])

    @property
    def axes(self):
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self):
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    def _repr_fits_vertical_(self):
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width=False):
        """
        Check if the full repr fits in the horizontal boundaries imposed by
        the display options width and max_columns. In case of a
        non-interactive session, no boundaries apply.

        ignore_width is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """

        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if ((max_columns and nb_columns > max_columns) or
                ((not ignore_width) and width and nb_columns > (width // 2))):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or not com.in_interactive_session():
            return True

        if (get_option('display.width') is not None or
                com.in_ipython_frontend()):
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[:min(max_rows, len(d))]
        else:
            # max_rows is None means unlimited rows, so the repr always
            # fits vertically; only the column row was checked above
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(l) for l in value.split('\n'))

        return repr_width < width

    def _info_repr(self):
        """True if the repr should show the info view."""
        info_repr_option = (get_option("display.large_repr") == "info")
        return info_repr_option and not (self._repr_fits_horizontal_() and
                                         self._repr_fits_vertical_())

    def __unicode__(self):
        """
        Return a string representation for a particular DataFrame

        Invoked by unicode(df) in py2 only. Yields a Unicode String in both
        py2/py3.
        """
        buf = StringIO(u(""))
        if self._info_repr():
            self.info(buf=buf)
            return buf.getvalue()

        max_rows = get_option("display.max_rows")
        max_cols = get_option("display.max_columns")
        show_dimensions = get_option("display.show_dimensions")
        if get_option("display.expand_frame_repr"):
            width, _ = console.get_console_size()
        else:
            width = None
        self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
                       line_width=width, show_dimensions=show_dimensions)

        return buf.getvalue()

    def _repr_html_(self):
        """
        Return a html representation for a particular DataFrame.
        Mainly for IPython notebook.
        """
        # qtconsole doesn't report its line width, and also
        # behaves badly when outputting an HTML table
        # that doesn't fit the window, so disable it.
        # XXX: In IPython 3.x and above, the Qt console will not attempt to
        # display HTML, so this check can be removed when support for
        # IPython 2.x is no longer needed.
        if com.in_qtconsole():
            # 'HTML output is disabled in QtConsole'
            return None

        if self._info_repr():
            buf = StringIO(u(""))
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace('<', r'&lt;', 1)
            val = val.replace('>', r'&gt;', 1)
            return '<pre>' + val + '</pre>'

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            return self.to_html(max_rows=max_rows, max_cols=max_cols,
                                show_dimensions=show_dimensions,
                                notebook=True)
        else:
            return None

    @property
    def style(self):
        """
        Property returning a Styler object containing methods for
        building a styled HTML representation of the DataFrame.

        See Also
        --------
        pandas.io.formats.style.Styler
        """
        from pandas.io.formats.style import Styler
        return Styler(self)

    def iteritems(self):
        """
        Iterator over (column name, Series) pairs.

        See also
        --------
        iterrows : Iterate over DataFrame rows as (index, Series) pairs.
        itertuples : Iterate over DataFrame rows as namedtuples of the values.

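        Examples
        --------
        A minimal added example (illustrative only; assumes the conventional
        ``import pandas as pd``):

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> for label, content in df.iteritems():
        ...     print(label)
        a
        b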
722 """
723 if self.columns.is_unique and hasattr(self, '_item_cache'):
724 for k in self.columns:
725 yield k, self._get_item_cache(k)
726 else:
727 for i, k in enumerate(self.columns):
728 yield k, self._ixs(i, axis=1)
729
    def iterrows(self):
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Notes
        -----

        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.

        Returns
        -------
        it : generator
            A generator that iterates over the rows of the frame.

        See also
        --------
        itertuples : Iterate over DataFrame rows as namedtuples of the values.
        iteritems : Iterate over (column name, Series) pairs.

        """
        columns = self.columns
        klass = self._constructor_sliced
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k)
            yield k, s

    def itertuples(self, index=True, name="Pandas"):
        """
        Iterate over DataFrame rows as namedtuples, with index value as first
        element of the tuple.

        Parameters
        ----------
        index : boolean, default True
            If True, return the index as the first element of the tuple.
        name : string, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.
        With a large number of columns (>255), regular tuples are returned.

        See also
        --------
        iterrows : Iterate over DataFrame rows as (index, Series) pairs.
        iteritems : Iterate over (column name, Series) pairs.

        Examples
        --------

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]},
        ...                   index=['a', 'b'])
        >>> df
           col1  col2
        a     1   0.1
        b     2   0.2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='a', col1=1, col2=0.10000000000000001)
        Pandas(Index='b', col1=2, col2=0.20000000000000001)

        """
        arrays = []
        fields = []
        if index:
            arrays.append(self.index)
            fields.append("Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        # Python 3 supports at most 255 arguments to constructor, and
        # things get slow with this many fields in Python 2
        if name is not None and len(self.columns) + index < 256:
            # `rename` is unsupported in Python 2.6
            try:
                itertuple = collections.namedtuple(name,
                                                   fields + list(self.columns),
                                                   rename=True)
                return map(itertuple._make, zip(*arrays))
            except Exception:
                pass

        # fallback to regular tuples
        return zip(*arrays)

    items = iteritems

    def __len__(self):
        """Returns length of info axis, but here we use the index."""
        return len(self.index)

    def dot(self, other):
        """
        Matrix multiplication with DataFrame or Series objects. Can also be
        called using `self @ other` in Python >= 3.5.

        Parameters
        ----------
        other : DataFrame or Series

        Returns
        -------
        dot_product : DataFrame or Series
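
        Examples
        --------
        A minimal added example (illustrative; the output assumes exactly
        this data):

        >>> df = pd.DataFrame([[1, 2], [3, 4]])
        >>> other = pd.DataFrame([[5, 6], [7, 8]])
        >>> df.dot(other)
            0   1
        0  19  22
        1  43  50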
860 """
861 if isinstance(other, (Series, DataFrame)):
862 common = self.columns.union(other.index)
863 if (len(common) > len(self.columns) or
864 len(common) > len(other.index)):
865 raise ValueError('matrices are not aligned')
866
867 left = self.reindex(columns=common, copy=False)
868 right = other.reindex(index=common, copy=False)
869 lvals = left.values
870 rvals = right.values
871 else:
872 left = self
873 lvals = self.values
874 rvals = np.asarray(other)
875 if lvals.shape[1] != rvals.shape[0]:
876 raise ValueError('Dot product shape mismatch, '
877 '{l} vs {r}'.format(l=lvals.shape,
878 r=rvals.shape))
879
880 if isinstance(other, DataFrame):
881 return self._constructor(np.dot(lvals, rvals), index=left.index,
882 columns=other.columns)
883 elif isinstance(other, Series):
884 return Series(np.dot(lvals, rvals), index=left.index)
885 elif isinstance(rvals, (np.ndarray, Index)):
886 result = np.dot(lvals, rvals)
887 if result.ndim == 2:
888 return self._constructor(result, index=left.index)
889 else:
890 return Series(result, index=left.index)
891 else: # pragma: no cover
892 raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
893
894 def __matmul__(self, other):
895 """ Matrix multiplication using binary `@` operator in Python>=3.5 """
896 return self.dot(other)
897
898 def __rmatmul__(self, other):
899 """ Matrix multiplication using binary `@` operator in Python>=3.5 """
900 return self.T.dot(np.transpose(other)).T
901
    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(cls, data, orient='columns', dtype=None, columns=None):
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
        dtype : dtype, default None
            Data type to force, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'``.

            .. versionadded:: 0.23.0

        Returns
        -------
        pandas.DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from ndarray (structured
            dtype), list of tuples, dict, or DataFrame
        DataFrame : DataFrame object creation using constructor

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d
        """
        index = None
        orient = orient.lower()
        if orient == 'index':
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    data, index = list(data.values()), list(data.keys())
        elif orient == 'columns':
            if columns is not None:
                raise ValueError("cannot use columns parameter with "
                                 "orient='columns'")
        else:  # pragma: no cover
            raise ValueError('only recognize index or columns for orient')

        return cls(data, index=index, columns=columns, dtype=dtype)

    def to_dict(self, orient='dict', into=dict):
        """
        Convert the DataFrame to a dictionary.

        The type of the key-value pairs can be customized with the parameters
        (see below).

        Parameters
        ----------
        orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
            Determines the type of the values of the dictionary.

            - 'dict' (default) : dict like {column -> {index -> value}}
            - 'list' : dict like {column -> [values]}
            - 'series' : dict like {column -> Series(values)}
            - 'split' : dict like
              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
            - 'records' : list like
              [{column -> value}, ... , {column -> value}]
            - 'index' : dict like {index -> {column -> value}}

            Abbreviations are allowed. `s` indicates `series` and `sp`
            indicates `split`.

        into : class, default dict
            The collections.Mapping subclass used for all Mappings
            in the return value. Can be the actual class or an empty
            instance of the mapping type you want. If you want a
            collections.defaultdict, you must pass it initialized.

            .. versionadded:: 0.21.0

        Returns
        -------
        result : collections.Mapping like {column -> {index -> value}}

        See Also
        --------
        DataFrame.from_dict: create a DataFrame from a dictionary
        DataFrame.to_json: convert a DataFrame to JSON format

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2],
        ...                    'col2': [0.5, 0.75]},
        ...                   index=['a', 'b'])
        >>> df
           col1  col2
        a     1  0.50
        b     2  0.75
        >>> df.to_dict()
        {'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}}

        You can specify the return orientation.

        >>> df.to_dict('series')
        {'col1': a    1
        b    2
        Name: col1, dtype: int64,
        'col2': a    0.50
        b    0.75
        Name: col2, dtype: float64}

        >>> df.to_dict('split')
        {'index': ['a', 'b'], 'columns': ['col1', 'col2'],
         'data': [[1.0, 0.5], [2.0, 0.75]]}

        >>> df.to_dict('records')
        [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}]

        >>> df.to_dict('index')
        {'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}}

        You can also specify the mapping type.

        >>> from collections import OrderedDict, defaultdict
        >>> df.to_dict(into=OrderedDict)
        OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])),
                     ('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))])

        If you want a `defaultdict`, you need to initialize it:

        >>> dd = defaultdict(list)
        >>> df.to_dict('records', into=dd)
        [defaultdict(<class 'list'>, {'col1': 1.0, 'col2': 0.5}),
         defaultdict(<class 'list'>, {'col1': 2.0, 'col2': 0.75})]
        """
        if not self.columns.is_unique:
            warnings.warn("DataFrame columns are not unique, some "
                          "columns will be omitted.", UserWarning,
                          stacklevel=2)
        # GH16122
        into_c = com.standardize_mapping(into)
        if orient.lower().startswith('d'):
            return into_c(
                (k, v.to_dict(into)) for k, v in compat.iteritems(self))
        elif orient.lower().startswith('l'):
            return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
        elif orient.lower().startswith('sp'):
            return into_c((('index', self.index.tolist()),
                           ('columns', self.columns.tolist()),
                           ('data', lib.map_infer(self.values.ravel(),
                                                  com._maybe_box_datetimelike)
                            .reshape(self.values.shape).tolist())))
        elif orient.lower().startswith('s'):
            return into_c((k, com._maybe_box_datetimelike(v))
                          for k, v in compat.iteritems(self))
        elif orient.lower().startswith('r'):
            return [into_c((k, com._maybe_box_datetimelike(v))
                           for k, v in zip(self.columns, np.atleast_1d(row)))
                    for row in self.values]
        elif orient.lower().startswith('i'):
            return into_c((t[0], dict(zip(self.columns, t[1:])))
                          for t in self.itertuples())
        else:
            raise ValueError("orient '{o}' not understood".format(o=orient))

    def to_gbq(self, destination_table, project_id, chunksize=None,
               verbose=None, reauth=False, if_exists='fail', private_key=None,
               auth_local_webserver=False, table_schema=None):
        """
        Write a DataFrame to a Google BigQuery table.

        This function requires the `pandas-gbq package
        <https://pandas-gbq.readthedocs.io>`__.

        Authentication to the Google BigQuery service is via OAuth 2.0.

        - If ``private_key`` is provided, the library loads the JSON service
          account credentials and uses those to authenticate.

        - If no ``private_key`` is provided, the library tries `application
          default credentials`_.

          .. _application default credentials:
              https://cloud.google.com/docs/authentication/production#providing_credentials_to_your_application

        - If application default credentials are not found or cannot be used
          with BigQuery, the library authenticates with user account
          credentials. In this case, you will be asked to grant permissions
          for product name 'pandas GBQ'.

        Parameters
        ----------
        destination_table : str
            Name of table to be written, in the form 'dataset.tablename'.
        project_id : str
            Google BigQuery Account project ID.
        chunksize : int, optional
            Number of rows to be inserted in each chunk from the dataframe.
            Set to ``None`` to load the whole dataframe at once.
        reauth : bool, default False
            Force Google BigQuery to reauthenticate the user. This is useful
            if multiple accounts are used.
        if_exists : str, default 'fail'
            Behavior when the destination table exists. Value can be one of:

            ``'fail'``
                If table exists, do nothing.
            ``'replace'``
                If table exists, drop it, recreate it, and insert data.
            ``'append'``
                If table exists, insert data. Create if does not exist.
        private_key : str, optional
            Service account private key in JSON format. Can be file path
            or string contents. This is useful for remote server
            authentication (eg. Jupyter/IPython notebook on remote host).
        auth_local_webserver : bool, default False
            Use the `local webserver flow`_ instead of the `console flow`_
            when getting user credentials.

            .. _local webserver flow:
                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
            .. _console flow:
                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console

            *New in version 0.2.0 of pandas-gbq*.
        table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame columns
            conform, e.g. ``[{'name': 'col1', 'type': 'STRING'},...]``. If a
            schema is not provided, it will be generated according to the
            dtypes of the DataFrame columns. See the BigQuery API
            documentation on available names of a field.

            *New in version 0.3.1 of pandas-gbq*.
        verbose : boolean, deprecated
            *Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
            to adjust verbosity instead
            <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.

        See Also
        --------
        pandas_gbq.to_gbq : This function in the pandas-gbq library.
        pandas.read_gbq : Read a DataFrame from Google BigQuery.
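
        Examples
        --------
        A sketch of a typical call; ``'my_dataset.my_table'`` and
        ``'my-project-id'`` are placeholder names, and the example is skipped
        because it needs the pandas-gbq package plus valid credentials:

        >>> df.to_gbq('my_dataset.my_table', 'my-project-id',
        ...           if_exists='append')  # doctest: +SKIP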
1181 """
1182 from pandas.io import gbq
1183 return gbq.to_gbq(
1184 self, destination_table, project_id, chunksize=chunksize,
1185 verbose=verbose, reauth=reauth, if_exists=if_exists,
1186 private_key=private_key, auth_local_webserver=auth_local_webserver,
1187 table_schema=table_schema)
1188
    @classmethod
    def from_records(cls, data, index=None, exclude=None, columns=None,
                     coerce_float=False, nrows=None):
        """
        Convert structured or record ndarray to DataFrame

        Parameters
        ----------
        data : ndarray (structured dtype), list of tuples, dict, or DataFrame
        index : string, list of fields, array-like
            Field of array to use as the index, alternately a specific set of
            input labels to use
        exclude : sequence, default None
            Columns or fields to exclude
        columns : sequence, default None
            Column names to use. If the passed data do not have names
            associated with them, this argument provides names for the
            columns. Otherwise this argument indicates the order of the
            columns in the result (any names not found in the data will
            become all-NA columns)
        coerce_float : boolean, default False
            Attempt to convert values of non-string, non-numeric objects (like
            decimal.Decimal) to floating point, useful for SQL result sets

        Returns
        -------
        df : DataFrame
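
        Examples
        --------
        An illustrative added example with a structured array (assumes the
        usual ``pd`` and ``np`` imports):

        >>> data = np.array([(1, 2.0), (3, 4.0)],
        ...                 dtype=[('x', '<i8'), ('y', '<f8')])
        >>> pd.DataFrame.from_records(data)
           x    y
        0  1  2.0
        1  3  4.0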
1216 """
1217
1218 # Make a copy of the input columns so we can modify it
1219 if columns is not None:
1220 columns = _ensure_index(columns)
1221
1222 if is_iterator(data):
1223 if nrows == 0:
1224 return cls()
1225
1226 try:
1227 first_row = next(data)
1228 except StopIteration:
1229 return cls(index=index, columns=columns)
1230
1231 dtype = None
1232 if hasattr(first_row, 'dtype') and first_row.dtype.names:
1233 dtype = first_row.dtype
1234
1235 values = [first_row]
1236
1237 if nrows is None:
1238 values += data
1239 else:
1240 values.extend(itertools.islice(data, nrows - 1))
1241
1242 if dtype is not None:
1243 data = np.array(values, dtype=dtype)
1244 else:
1245 data = values
1246
1247 if isinstance(data, dict):
1248 if columns is None:
1249 columns = arr_columns = _ensure_index(sorted(data))
1250 arrays = [data[k] for k in columns]
1251 else:
1252 arrays = []
1253 arr_columns = []
1254 for k, v in compat.iteritems(data):
1255 if k in columns:
1256 arr_columns.append(k)
1257 arrays.append(v)
1258
1259 arrays, arr_columns = _reorder_arrays(arrays, arr_columns,
1260 columns)
1261
1262 elif isinstance(data, (np.ndarray, DataFrame)):
1263 arrays, columns = _to_arrays(data, columns)
1264 if columns is not None:
1265 columns = _ensure_index(columns)
1266 arr_columns = columns
1267 else:
1268 arrays, arr_columns = _to_arrays(data, columns,
1269 coerce_float=coerce_float)
1270
1271 arr_columns = _ensure_index(arr_columns)
1272 if columns is not None:
1273 columns = _ensure_index(columns)
1274 else:
1275 columns = arr_columns
1276
1277 if exclude is None:
1278 exclude = set()
1279 else:
1280 exclude = set(exclude)
1281
1282 result_index = None
1283 if index is not None:
1284 if (isinstance(index, compat.string_types) or
1285 not hasattr(index, "__iter__")):
1286 i = columns.get_loc(index)
1287 exclude.add(index)
1288 if len(arrays) > 0:
1289 result_index = Index(arrays[i], name=index)
1290 else:
1291 result_index = Index([], name=index)
1292 else:
1293 try:
1294 to_remove = [arr_columns.get_loc(field) for field in index]
1295 index_data = [arrays[i] for i in to_remove]
1296 result_index = _ensure_index_from_sequences(index_data,
1297 names=index)
1298
1299 exclude.update(index)
1300 except Exception:
1301 result_index = index
1302
1303 if any(exclude):
1304 arr_exclude = [x for x in exclude if x in arr_columns]
1305 to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
1306 arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
1307
1308 arr_columns = arr_columns.drop(arr_exclude)
1309 columns = columns.drop(exclude)
1310
1311 mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns)
1312
1313 return cls(mgr)
1314
    def to_records(self, index=True, convert_datetime64=None):
        """
        Convert DataFrame to a NumPy record array.

        Index will be put in the 'index' field of the record array if
        requested.

        Parameters
        ----------
        index : boolean, default True
            Include index in resulting record array, stored in 'index' field.
        convert_datetime64 : boolean, default None
            .. deprecated:: 0.23.0

            Whether to convert the index to datetime.datetime if it is a
            DatetimeIndex.

        Returns
        -------
        y : numpy.recarray

        See Also
        --------
        DataFrame.from_records: convert structured or record ndarray
            to DataFrame.
        numpy.recarray: ndarray that allows field access using
            attributes, analogous to typed columns in a
            spreadsheet.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
        ...                   index=['a', 'b'])
        >>> df
           A     B
        a  1  0.50
        b  2  0.75
        >>> df.to_records()
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])

        The index can be excluded from the record array:

        >>> df.to_records(index=False)
        rec.array([(1, 0.5 ), (2, 0.75)],
                  dtype=[('A', '<i8'), ('B', '<f8')])

        By default, timestamps are converted to `datetime.datetime`:

        >>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min')
        >>> df
                             A     B
        2018-01-01 09:00:00  1  0.50
        2018-01-01 09:01:00  2  0.75
        >>> df.to_records()
        rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ),
                   (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)],
                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])

        The timestamp conversion can be disabled so NumPy's datetime64
        data type is used instead:

        >>> df.to_records(convert_datetime64=False)
        rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
                   ('2018-01-01T09:01:00.000000000', 2, 0.75)],
                  dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
        """

        if convert_datetime64 is not None:
            warnings.warn("The 'convert_datetime64' parameter is "
                          "deprecated and will be removed in a future "
                          "version",
                          FutureWarning, stacklevel=2)

        if index:
            if is_datetime64_any_dtype(self.index) and convert_datetime64:
                ix_vals = [self.index.to_pydatetime()]
            else:
                if isinstance(self.index, MultiIndex):
                    # array of tuples to numpy cols. copy copy copy
                    ix_vals = lmap(np.array, zip(*self.index.values))
                else:
                    ix_vals = [self.index.values]

            arrays = ix_vals + [self[c].get_values() for c in self.columns]

            count = 0
            index_names = list(self.index.names)
            if isinstance(self.index, MultiIndex):
                for i, n in enumerate(index_names):
                    if n is None:
                        index_names[i] = 'level_%d' % count
                        count += 1
            elif index_names[0] is None:
                index_names = ['index']
            names = (lmap(compat.text_type, index_names) +
                     lmap(compat.text_type, self.columns))
        else:
            arrays = [self[c].get_values() for c in self.columns]
            names = lmap(compat.text_type, self.columns)

        formats = [v.dtype for v in arrays]
        return np.rec.fromarrays(
            arrays,
            dtype={'names': names, 'formats': formats}
        )

    @classmethod
    def from_items(cls, items, columns=None, orient='columns'):
        """Construct a DataFrame from a list of tuples

        .. deprecated:: 0.23.0
          `from_items` is deprecated and will be removed in a future version.
          Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
          instead.
          :meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
          may be used to preserve the key order.

        Convert (key, value) pairs to DataFrame. The keys will be the axis
        index (usually the columns, but depends on the specified
        orientation). The values should be arrays or Series.

        Parameters
        ----------
        items : sequence of (key, value) pairs
            Values should be arrays or Series.
        columns : sequence of column labels, optional
            Must be passed if orient='index'.
        orient : {'columns', 'index'}, default 'columns'
            The "orientation" of the data. If the keys of the
            input correspond to column labels, pass 'columns'
            (default). Otherwise if the keys correspond to the index,
            pass 'index'.

        Returns
        -------
        frame : DataFrame
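
        Examples
        --------
        An illustrative added example; since ``from_items`` is deprecated,
        new code should prefer ``DataFrame.from_dict``:

        >>> items = [('A', [1, 2, 3]), ('B', [4, 5, 6])]
        >>> pd.DataFrame.from_items(items)
           A  B
        0  1  4
        1  2  5
        2  3  6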
1452 """
1453
1454 warnings.warn("from_items is deprecated. Please use "
1455 "DataFrame.from_dict(dict(items), ...) instead. "
1456 "DataFrame.from_dict(OrderedDict(items)) may be used to "
1457 "preserve the key order.",
1458 FutureWarning, stacklevel=2)
1459
1460 keys, values = lzip(*items)
1461
1462 if orient == 'columns':
1463 if columns is not None:
1464 columns = _ensure_index(columns)
1465
1466 idict = dict(items)
1467 if len(idict) < len(items):
1468 if not columns.equals(_ensure_index(keys)):
1469 raise ValueError('With non-unique item names, passed '
1470 'columns must be identical')
1471 arrays = values
1472 else:
1473 arrays = [idict[k] for k in columns if k in idict]
1474 else:
1475 columns = _ensure_index(keys)
1476 arrays = values
1477
1478 # GH 17312
1479 # Provide more informative error msg when scalar values passed
1480 try:
1481 return cls._from_arrays(arrays, columns, None)
1482
1483 except ValueError:
1484 if not is_nested_list_like(values):
1485 raise ValueError('The value in each (key, value) pair '
1486 'must be an array, Series, or dict')
1487
1488 elif orient == 'index':
1489 if columns is None:
1490 raise TypeError("Must pass columns with orient='index'")
1491
1492 keys = _ensure_index(keys)
1493
1494 # GH 17312
1495 # Provide more informative error msg when scalar values passed
1496 try:
1497 arr = np.array(values, dtype=object).T
1498 data = [lib.maybe_convert_objects(v) for v in arr]
1499 return cls._from_arrays(data, columns, keys)
1500
1501 except TypeError:
1502 if not is_nested_list_like(values):
1503 raise ValueError('The value in each (key, value) pair '
1504 'must be an array, Series, or dict')
1505
1506 else: # pragma: no cover
1507 raise ValueError("'orient' must be either 'columns' or 'index'")
1508
    @classmethod
    def _from_arrays(cls, arrays, columns, index, dtype=None):
        mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
        return cls(mgr)

    @classmethod
    def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,
                 encoding=None, tupleize_cols=None,
                 infer_datetime_format=False):
        """Read CSV file.

        .. deprecated:: 0.21.0
            Use :func:`pandas.read_csv` instead.

        It is preferable to use the more powerful :func:`pandas.read_csv`
        for most general purposes, but ``from_csv`` makes for an easy
        roundtrip to and from a file (the exact counterpart of
        ``to_csv``), especially with a DataFrame of time series data.

        This method only differs from the preferred :func:`pandas.read_csv`
        in some defaults:

        - `index_col` is ``0`` instead of ``None`` (take first column as index
          by default)
        - `parse_dates` is ``True`` instead of ``False`` (try parsing the index
          as datetime by default)

        So a ``pd.DataFrame.from_csv(path)`` can be replaced by
        ``pd.read_csv(path, index_col=0, parse_dates=True)``.

        Parameters
        ----------
        path : string file path or file handle / StringIO
        header : int, default 0
            Row to use as header (skip prior rows)
        sep : string, default ','
            Field delimiter
        index_col : int or sequence, default 0
            Column to use for index. If a sequence is given, a MultiIndex
            is used. Different default from read_table
        parse_dates : boolean, default True
            Parse dates. Different default from read_table
        tupleize_cols : boolean, default False
            Write MultiIndex columns as a list of tuples (if True) or in
            the new, expanded format (if False)
        infer_datetime_format : boolean, default False
            If True and `parse_dates` is True for a column, try to infer the
            datetime format based on the first datetime string. If the format
            can be inferred, there often will be a large parsing speed-up.

        See also
        --------
        pandas.read_csv

        Returns
        -------
        y : DataFrame

        """

        warnings.warn("from_csv is deprecated. Please use read_csv(...) "
                      "instead. Note that some of the default arguments are "
                      "different, so please refer to the documentation "
                      "for from_csv when changing your function calls",
                      FutureWarning, stacklevel=2)

        from pandas.io.parsers import read_table
        return read_table(path, header=header, sep=sep,
                          parse_dates=parse_dates, index_col=index_col,
                          encoding=encoding, tupleize_cols=tupleize_cols,
                          infer_datetime_format=infer_datetime_format)

    def to_sparse(self, fill_value=None, kind='block'):
        """
        Convert to SparseDataFrame

        Parameters
        ----------
        fill_value : float, default NaN
        kind : {'block', 'integer'}

        Returns
        -------
        y : SparseDataFrame
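
        Examples
        --------
        A small added sketch; the density value shown assumes exactly this
        data and fill value:

        >>> df = pd.DataFrame({'A': [0.0, 0.0, 1.0]})
        >>> sdf = df.to_sparse(fill_value=0.0)
        >>> sdf.density
        0.3333333333333333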
1593 """
1594 from pandas.core.sparse.frame import SparseDataFrame
1595 return SparseDataFrame(self._series, index=self.index,
1596 columns=self.columns, default_kind=kind,
1597 default_fill_value=fill_value)
1598
    def to_panel(self):
        """
        Transform long (stacked) format (DataFrame) into wide (3D, Panel)
        format.

        .. deprecated:: 0.20.0

        Currently the index of the DataFrame must be a 2-level MultiIndex.
        This may be generalized later

        Returns
        -------
        panel : Panel
        """
        # only support this kind for now
        if (not isinstance(self.index, MultiIndex) or  # pragma: no cover
                len(self.index.levels) != 2):
            raise NotImplementedError('Only 2-level MultiIndex are supported.')

        if not self.index.is_unique:
            raise ValueError("Can't convert non-uniquely indexed "
                             "DataFrame to Panel")

        self._consolidate_inplace()

        # minor axis must be sorted
        if self.index.lexsort_depth < 2:
            selfsorted = self.sort_index(level=0)
        else:
            selfsorted = self

        major_axis, minor_axis = selfsorted.index.levels
        major_labels, minor_labels = selfsorted.index.labels
        shape = len(major_axis), len(minor_axis)

        # preserve names, if any
        major_axis = major_axis.copy()
        major_axis.name = self.index.names[0]

        minor_axis = minor_axis.copy()
        minor_axis.name = self.index.names[1]

        # create new axes
        new_axes = [selfsorted.columns, major_axis, minor_axis]

        # create new manager
        new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
                                              labels=[major_labels,
                                                      minor_labels],
                                              shape=shape,
                                              ref_items=selfsorted.columns)

        return self._constructor_expanddim(new_mgr)

1653 def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
1654 columns=None, header=True, index=True, index_label=None,
1655 mode='w', encoding=None, compression=None, quoting=None,
1656 quotechar='"', line_terminator='\n', chunksize=None,
1657 tupleize_cols=None, date_format=None, doublequote=True,
1658 escapechar=None, decimal='.'):
1659 r"""Write DataFrame to a comma-separated values (csv) file
1660
1661 Parameters
1662 ----------
1663 path_or_buf : string or file handle, default None
1664 File path or object, if None is provided the result is returned as
1665 a string.
1666 sep : character, default ','
1667 Field delimiter for the output file.
1668 na_rep : string, default ''
1669 Missing data representation
1670 float_format : string, default None
1671 Format string for floating point numbers
1672 columns : sequence, optional
1673 Columns to write
1674 header : boolean or list of string, default True
1675 Write out the column names. If a list of strings is given it is
1676 assumed to be aliases for the column names
1677 index : boolean, default True
1678 Write row names (index)
1679 index_label : string or sequence, or False, default None
1680 Column label for index column(s) if desired. If None is given, and
1681 `header` and `index` are True, then the index names are used. A
1682 sequence should be given if the DataFrame uses MultiIndex. If
1683 False do not print fields for index names. Use index_label=False
1684 for easier importing in R
1685 mode : str
1686 Python write mode, default 'w'
1687 encoding : string, optional
1688 A string representing the encoding to use in the output file,
1689 defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
1690 compression : string, optional
1691 A string representing the compression to use in the output file.
1692 Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
1693 used when the first argument is a filename.
1694        line_terminator : string, default ``'\n'``
1695            The newline character or character sequence to use in the output
1696            file.
1697        quoting : optional constant from csv module
1698            Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
1699            then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
1700            will treat them as non-numeric.
1701        quotechar : string (length 1), default '\"'
1702            Character used to quote fields.
1703        doublequote : boolean, default True
1704            Control quoting of `quotechar` inside a field.
1705        escapechar : string (length 1), default None
1706            Character used to escape `sep` and `quotechar` when appropriate.
1707        chunksize : int or None
1708            Rows to write at a time.
1709        tupleize_cols : boolean, default False
1710            .. deprecated:: 0.21.0
1711                This argument will be removed; each level of a MultiIndex
1712                column will always be written as a separate row in the CSV
1713                file.
1714
1715            Write MultiIndex columns as a list of tuples (if True) or in
1716            the new, expanded format, where each MultiIndex column is a row
1717            in the CSV (if False).
1718        date_format : string, default None
1719            Format string for datetime objects.
1719        decimal : string, default '.'
1720            Character recognized as decimal separator. E.g. use ',' for
1721            European data.
1722
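        Examples
        --------
        A minimal sketch; when `path_or_buf` is None the CSV text is
        returned as a string (the file name below is hypothetical):

        >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
        ...                    'mask': ['red', 'purple']})
        >>> df.to_csv(index=False)
        'name,mask\nRaphael,red\nDonatello,purple\n'
        >>> df.to_csv('out.csv', index=False)  # doctest: +SKIP
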
1723 """
1724
1725 if tupleize_cols is not None:
1726 warnings.warn("The 'tupleize_cols' parameter is deprecated and "
1727 "will be removed in a future version",
1728 FutureWarning, stacklevel=2)
1729 else:
1730 tupleize_cols = False
1731
1732 from pandas.io.formats.csvs import CSVFormatter
1733 formatter = CSVFormatter(self, path_or_buf,
1734 line_terminator=line_terminator, sep=sep,
1735 encoding=encoding,
1736 compression=compression, quoting=quoting,
1737 na_rep=na_rep, float_format=float_format,
1738 cols=columns, header=header, index=index,
1739 index_label=index_label, mode=mode,
1740 chunksize=chunksize, quotechar=quotechar,
1741 tupleize_cols=tupleize_cols,
1742 date_format=date_format,
1743 doublequote=doublequote,
1744 escapechar=escapechar, decimal=decimal)
1745 formatter.save()
1746
1747 if path_or_buf is None:
1748 return formatter.path_or_buf.getvalue()
1749
1750 @Appender(_shared_docs['to_excel'] % _shared_doc_kwargs)
1751 def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
1752 float_format=None, columns=None, header=True, index=True,
1753 index_label=None, startrow=0, startcol=0, engine=None,
1754 merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
1755 freeze_panes=None):
1756
1757 from pandas.io.formats.excel import ExcelFormatter
1758 formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns,
1759 header=header,
1760 float_format=float_format, index=index,
1761 index_label=index_label,
1762 merge_cells=merge_cells,
1763 inf_rep=inf_rep)
1764 formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
1765 startcol=startcol, freeze_panes=freeze_panes,
1766 engine=engine)
1767
1768 def to_stata(self, fname, convert_dates=None, write_index=True,
1769 encoding="latin-1", byteorder=None, time_stamp=None,
1770 data_label=None, variable_labels=None, version=114,
1771 convert_strl=None):
1772 """
1773 Export Stata binary dta files.
1774
1775 Parameters
1776 ----------
1777 fname : path (string), buffer or path object
1778 string, path object (pathlib.Path or py._path.local.LocalPath) or
1779            object implementing a binary write() function. If using a buffer
1780 then the buffer will not be automatically closed after the file
1781 data has been written.
1782 convert_dates : dict
1783 Dictionary mapping columns containing datetime types to stata
1784 internal format to use when writing the dates. Options are 'tc',
1785 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
1786 or a name. Datetime columns that do not have a conversion type
1787 specified will be converted to 'tc'. Raises NotImplementedError if
1788 a datetime column has timezone information.
1789 write_index : bool
1790 Write the index to Stata dataset.
1791 encoding : str
1792 Default is latin-1. Unicode is not supported.
1793 byteorder : str
1794 Can be ">", "<", "little", or "big". default is `sys.byteorder`.
1795 time_stamp : datetime
1796 A datetime to use as file creation date. Default is the current
1797 time.
1798 data_label : str
1799 A label for the data set. Must be 80 characters or smaller.
1800 variable_labels : dict
1801 Dictionary containing columns as keys and variable labels as
1802 values. Each label must be 80 characters or smaller.
1803
1804 .. versionadded:: 0.19.0
1805
1806 version : {114, 117}
1807            Version to use in the output dta file. Version 114 can be
1808            read by Stata 10 and later. Version 117 can be read by Stata 13
1809 or later. Version 114 limits string variables to 244 characters or
1810 fewer while 117 allows strings with lengths up to 2,000,000
1811 characters.
1812
1813 .. versionadded:: 0.23.0
1814
1815 convert_strl : list, optional
1816 List of column names to convert to string columns to Stata StrL
1817 format. Only available if version is 117. Storing strings in the
1818 StrL format can produce smaller dta files if strings have more than
1819 8 characters and values are repeated.
1820
1821 .. versionadded:: 0.23.0
1822
1823 Raises
1824 ------
1825 NotImplementedError
1826 * If datetimes contain timezone information
1827 * Column dtype is not representable in Stata
1828 ValueError
1829 * Columns listed in convert_dates are neither datetime64[ns]
1830                nor datetime.datetime
1831 * Column listed in convert_dates is not in DataFrame
1832 * Categorical label contains more than 32,000 characters
1833
1834 .. versionadded:: 0.19.0
1835
1836 See Also
1837 --------
1838 pandas.read_stata : Import Stata data files
1839 pandas.io.stata.StataWriter : low-level writer for Stata data files
1840 pandas.io.stata.StataWriter117 : low-level writer for version 117 files
1841
1842 Examples
1843 --------
1844 >>> data.to_stata('./data_file.dta')
1845
1846 Or with dates
1847
1848 >>> data.to_stata('./date_data_file.dta', {2 : 'tw'})
1849
1850 Alternatively you can create an instance of the StataWriter class
1851
        >>> from pandas.io.stata import StataWriter
1852        >>> writer = StataWriter('./data_file.dta', data)
1853 >>> writer.write_file()
1854
1855 With dates:
1856
1857 >>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
1858 >>> writer.write_file()
1859 """
1860 kwargs = {}
1861 if version not in (114, 117):
1862 raise ValueError('Only formats 114 and 117 supported.')
1863 if version == 114:
1864 if convert_strl is not None:
1865 raise ValueError('strl support is only available when using '
1866 'format 117')
1867 from pandas.io.stata import StataWriter as statawriter
1868 else:
1869 from pandas.io.stata import StataWriter117 as statawriter
1870 kwargs['convert_strl'] = convert_strl
1871
1872 writer = statawriter(fname, self, convert_dates=convert_dates,
1873 encoding=encoding, byteorder=byteorder,
1874 time_stamp=time_stamp, data_label=data_label,
1875 write_index=write_index,
1876 variable_labels=variable_labels, **kwargs)
1877 writer.write_file()
1878
1879 def to_feather(self, fname):
1880 """
1881        Write out the binary feather-format for DataFrames.
1882
1883 .. versionadded:: 0.20.0
1884
1885 Parameters
1886 ----------
1887 fname : str
1888 string file path
1889
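        Examples
        --------
        A sketch; the path is hypothetical and a feather backend (e.g.
        ``pyarrow``) must be installed:

        >>> df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
        >>> df.to_feather('df.feather')  # doctest: +SKIP
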
1890 """
1891 from pandas.io.feather_format import to_feather
1892 to_feather(self, fname)
1893
1894 def to_parquet(self, fname, engine='auto', compression='snappy',
1895 **kwargs):
1896 """
1897 Write a DataFrame to the binary parquet format.
1898
1899 .. versionadded:: 0.21.0
1900
1901 This function writes the dataframe as a `parquet file
1902 <https://parquet.apache.org/>`_. You can choose different parquet
1903 backends, and have the option of compression. See
1904 :ref:`the user guide <io.parquet>` for more details.
1905
1906 Parameters
1907 ----------
1908 fname : str
1909 String file path.
1910 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
1911 Parquet library to use. If 'auto', then the option
1912 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
1913 behavior is to try 'pyarrow', falling back to 'fastparquet' if
1914 'pyarrow' is unavailable.
1915 compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
1916 Name of the compression to use. Use ``None`` for no compression.
1917 **kwargs
1918 Additional arguments passed to the parquet library. See
1919 :ref:`pandas io <io.parquet>` for more details.
1920
1921 See Also
1922 --------
1923 read_parquet : Read a parquet file.
1924 DataFrame.to_csv : Write a csv file.
1925 DataFrame.to_sql : Write to a sql table.
1926 DataFrame.to_hdf : Write to hdf.
1927
1928 Notes
1929 -----
1930 This function requires either the `fastparquet
1931 <https://pypi.org/project/fastparquet>`_ or `pyarrow
1932 <https://arrow.apache.org/docs/python/>`_ library.
1933
1934 Examples
1935 --------
1936 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
1937 >>> df.to_parquet('df.parquet.gzip', compression='gzip')
1938 >>> pd.read_parquet('df.parquet.gzip')
1939 col1 col2
1940 0 1 3
1941 1 2 4
1942 """
1943 from pandas.io.parquet import to_parquet
1944 to_parquet(self, fname, engine,
1945 compression=compression, **kwargs)
1946
1947 @Substitution(header='Write out the column names. If a list of strings '
1948 'is given, it is assumed to be aliases for the '
1949 'column names')
1950 @Appender(fmt.docstring_to_string, indents=1)
1951 def to_string(self, buf=None, columns=None, col_space=None, header=True,
1952 index=True, na_rep='NaN', formatters=None, float_format=None,
1953 sparsify=None, index_names=True, justify=None,
1954 line_width=None, max_rows=None, max_cols=None,
1955 show_dimensions=False):
1956 """
1957 Render a DataFrame to a console-friendly tabular output.
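
        Examples
        --------
        A small illustration; with the default ``buf=None`` the rendered
        text is returned as a string:

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> print(df.to_string())
           col1  col2
        0     1     3
        1     2     4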
1958 """
1959
1960 formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
1961 col_space=col_space, na_rep=na_rep,
1962 formatters=formatters,
1963 float_format=float_format,
1964 sparsify=sparsify, justify=justify,
1965 index_names=index_names,
1966 header=header, index=index,
1967 line_width=line_width,
1968 max_rows=max_rows,
1969 max_cols=max_cols,
1970 show_dimensions=show_dimensions)
1971 formatter.to_string()
1972
1973 if buf is None:
1974 result = formatter.buf.getvalue()
1975 return result
1976
1977 @Substitution(header='whether to print column labels, default True')
1978 @Appender(fmt.docstring_to_string, indents=1)
1979 def to_html(self, buf=None, columns=None, col_space=None, header=True,
1980 index=True, na_rep='NaN', formatters=None, float_format=None,
1981 sparsify=None, index_names=True, justify=None, bold_rows=True,
1982 classes=None, escape=True, max_rows=None, max_cols=None,
1983 show_dimensions=False, notebook=False, decimal='.',
1984 border=None, table_id=None):
1985 """
1986 Render a DataFrame as an HTML table.
1987
1988 `to_html`-specific options:
1989
1990 bold_rows : boolean, default True
1991 Make the row labels bold in the output
1992 classes : str or list or tuple, default None
1993 CSS class(es) to apply to the resulting html table
1994 escape : boolean, default True
1995 Convert the characters <, >, and & to HTML-safe sequences.
1996 max_rows : int, optional
1997 Maximum number of rows to show before truncating. If None, show
1998 all.
1999 max_cols : int, optional
2000 Maximum number of columns to show before truncating. If None, show
2001 all.
2002 decimal : string, default '.'
2003 Character recognized as decimal separator, e.g. ',' in Europe
2004
2005 .. versionadded:: 0.18.0
2006
2007 border : int
2008 A ``border=border`` attribute is included in the opening
2009 `<table>` tag. Default ``pd.options.html.border``.
2010
2011 .. versionadded:: 0.19.0
2012
2013 table_id : str, optional
2014 A css id is included in the opening `<table>` tag if specified.
2015
2016 .. versionadded:: 0.23.0
2017
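        Examples
        --------
        A short sketch; with the default ``buf=None`` the HTML markup is
        returned as a string (full markup omitted here):

        >>> df = pd.DataFrame({'col1': [1, 2]})
        >>> html = df.to_html()
        >>> html.startswith('<table')
        True
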
2018 """
2019
2020 if (justify is not None and
2021 justify not in fmt._VALID_JUSTIFY_PARAMETERS):
2022 raise ValueError("Invalid value for justify parameter")
2023
2024 formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
2025 col_space=col_space, na_rep=na_rep,
2026 formatters=formatters,
2027 float_format=float_format,
2028 sparsify=sparsify, justify=justify,
2029 index_names=index_names,
2030 header=header, index=index,
2031 bold_rows=bold_rows, escape=escape,
2032 max_rows=max_rows,
2033 max_cols=max_cols,
2034 show_dimensions=show_dimensions,
2035 decimal=decimal, table_id=table_id)
2036        # TODO: a generic formatter would be in DataFrameFormatter
2037 formatter.to_html(classes=classes, notebook=notebook, border=border)
2038
2039 if buf is None:
2040 return formatter.buf.getvalue()
2041
2042 def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
2043 null_counts=None):
2044 """
2045 Print a concise summary of a DataFrame.
2046
2047 This method prints information about a DataFrame including
2048 the index dtype and column dtypes, non-null values and memory usage.
2049
2050 Parameters
2051 ----------
2052 verbose : bool, optional
2053 Whether to print the full summary. By default, the setting in
2054 ``pandas.options.display.max_info_columns`` is followed.
2055 buf : writable buffer, defaults to sys.stdout
2056 Where to send the output. By default, the output is printed to
2057 sys.stdout. Pass a writable buffer if you need to further process
2058 the output.
2059 max_cols : int, optional
2060 When to switch from the verbose to the truncated output. If the
2061 DataFrame has more than `max_cols` columns, the truncated output
2062 is used. By default, the setting in
2063 ``pandas.options.display.max_info_columns`` is used.
2064 memory_usage : bool, str, optional
2065 Specifies whether total memory usage of the DataFrame
2066 elements (including the index) should be displayed. By default,
2067 this follows the ``pandas.options.display.memory_usage`` setting.
2068
2069            True always shows memory usage. False never shows memory usage.
2070 A value of 'deep' is equivalent to "True with deep introspection".
2071 Memory usage is shown in human-readable units (base-2
2072 representation). Without deep introspection a memory estimation is
2073            made based on column dtype and number of rows, assuming values
2074 consume the same memory amount for corresponding dtypes. With deep
2075 memory introspection, a real memory usage calculation is performed
2076 at the cost of computational resources.
2077 null_counts : bool, optional
2078 Whether to show the non-null counts. By default, this is shown
2079 only if the frame is smaller than
2080 ``pandas.options.display.max_info_rows`` and
2081 ``pandas.options.display.max_info_columns``. A value of True always
2082 shows the counts, and False never shows the counts.
2083
2084 Returns
2085 -------
2086 None
2087 This method prints a summary of a DataFrame and returns None.
2088
2089 See Also
2090 --------
2091 DataFrame.describe: Generate descriptive statistics of DataFrame
2092 columns.
2093 DataFrame.memory_usage: Memory usage of DataFrame columns.
2094
2095 Examples
2096 --------
2097 >>> int_values = [1, 2, 3, 4, 5]
2098 >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
2099 >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
2100 >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
2101 ... "float_col": float_values})
2102 >>> df
2103 int_col text_col float_col
2104 0 1 alpha 0.00
2105 1 2 beta 0.25
2106 2 3 gamma 0.50
2107 3 4 delta 0.75
2108 4 5 epsilon 1.00
2109
2110        Prints information about all columns:
2111
2112 >>> df.info(verbose=True)
2113 <class 'pandas.core.frame.DataFrame'>
2114 RangeIndex: 5 entries, 0 to 4
2115 Data columns (total 3 columns):
2116 int_col 5 non-null int64
2117 text_col 5 non-null object
2118 float_col 5 non-null float64
2119 dtypes: float64(1), int64(1), object(1)
2120 memory usage: 200.0+ bytes
2121
2122        Prints a summary of the column count and dtypes but no per-column
2123        information:
2124
2125 >>> df.info(verbose=False)
2126 <class 'pandas.core.frame.DataFrame'>
2127 RangeIndex: 5 entries, 0 to 4
2128 Columns: 3 entries, int_col to float_col
2129 dtypes: float64(1), int64(1), object(1)
2130 memory usage: 200.0+ bytes
2131
2132        Pipe the output of DataFrame.info to a buffer instead of sys.stdout,
2133        get the buffer content and write it to a text file:
2134
2135 >>> import io
2136 >>> buffer = io.StringIO()
2137 >>> df.info(buf=buffer)
2138 >>> s = buffer.getvalue()
2139 >>> with open("df_info.txt", "w", encoding="utf-8") as f:
2140 ... f.write(s)
2141 260
2142
2143        The `memory_usage` parameter enables deep introspection mode,
2144        especially useful for big DataFrames, to help fine-tune memory
2145        optimization:
2145
2146 >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
2147 >>> df = pd.DataFrame({
2148 ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
2149 ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
2150 ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
2151 ... })
2152 >>> df.info()
2153 <class 'pandas.core.frame.DataFrame'>
2154 RangeIndex: 1000000 entries, 0 to 999999
2155 Data columns (total 3 columns):
2156 column_1 1000000 non-null object
2157 column_2 1000000 non-null object
2158 column_3 1000000 non-null object
2159 dtypes: object(3)
2160 memory usage: 22.9+ MB
2161
2162 >>> df.info(memory_usage='deep')
2163 <class 'pandas.core.frame.DataFrame'>
2164 RangeIndex: 1000000 entries, 0 to 999999
2165 Data columns (total 3 columns):
2166 column_1 1000000 non-null object
2167 column_2 1000000 non-null object
2168 column_3 1000000 non-null object
2169 dtypes: object(3)
2170 memory usage: 188.8 MB
2171 """
2172
2173 if buf is None: # pragma: no cover
2174 buf = sys.stdout
2175
2176 lines = []
2177
2178 lines.append(str(type(self)))
2179 lines.append(self.index._summary())
2180
2181 if len(self.columns) == 0:
2182 lines.append('Empty {name}'.format(name=type(self).__name__))
2183 fmt.buffer_put_lines(buf, lines)
2184 return
2185
2186 cols = self.columns
2187
2188 # hack
2189 if max_cols is None:
2190 max_cols = get_option('display.max_info_columns',
2191 len(self.columns) + 1)
2192
2193 max_rows = get_option('display.max_info_rows', len(self) + 1)
2194
2195 if null_counts is None:
2196 show_counts = ((len(self.columns) <= max_cols) and
2197 (len(self) < max_rows))
2198 else:
2199 show_counts = null_counts
2200 exceeds_info_cols = len(self.columns) > max_cols
2201
2202 def _verbose_repr():
2203 lines.append('Data columns (total %d columns):' %
2204 len(self.columns))
2205 space = max(len(pprint_thing(k)) for k in self.columns) + 4
2206 counts = None
2207
2208 tmpl = "{count}{dtype}"
2209 if show_counts:
2210 counts = self.count()
2211 if len(cols) != len(counts): # pragma: no cover
2212 raise AssertionError(
2213 'Columns must equal counts '
2214 '({cols:d} != {counts:d})'.format(
2215 cols=len(cols), counts=len(counts)))
2216 tmpl = "{count} non-null {dtype}"
2217
2218 dtypes = self.dtypes
2219 for i, col in enumerate(self.columns):
2220 dtype = dtypes.iloc[i]
2221 col = pprint_thing(col)
2222
2223 count = ""
2224 if show_counts:
2225 count = counts.iloc[i]
2226
2227 lines.append(_put_str(col, space) + tmpl.format(count=count,
2228 dtype=dtype))
2229
2230 def _non_verbose_repr():
2231 lines.append(self.columns._summary(name='Columns'))
2232
2233 def _sizeof_fmt(num, size_qualifier):
2234 # returns size in human readable format
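            # e.g. _sizeof_fmt(22900000, '+') -> '21.8+ MB' (illustrative)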
2235 for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
2236 if num < 1024.0:
2237 return ("{num:3.1f}{size_q} "
2238 "{x}".format(num=num, size_q=size_qualifier, x=x))
2239 num /= 1024.0
2240 return "{num:3.1f}{size_q} {pb}".format(num=num,
2241 size_q=size_qualifier,
2242 pb='PB')
2243
2244 if verbose:
2245 _verbose_repr()
2246        elif verbose is False:  # specifically set to False, not necessarily None
2247 _non_verbose_repr()
2248 else:
2249 if exceeds_info_cols:
2250 _non_verbose_repr()
2251 else:
2252 _verbose_repr()
2253
2254 counts = self.get_dtype_counts()
2255 dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
2256 in sorted(compat.iteritems(counts))]
2257 lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
2258
2259 if memory_usage is None:
2260 memory_usage = get_option('display.memory_usage')
2261 if memory_usage:
2262 # append memory usage of df to display
2263 size_qualifier = ''
2264 if memory_usage == 'deep':
2265 deep = True
2266 else:
2267 # size_qualifier is just a best effort; not guaranteed to catch
2268 # all cases (e.g., it misses categorical data even with object
2269 # categories)
2270 deep = False
2271 if ('object' in counts or
2272 self.index._is_memory_usage_qualified()):
2273 size_qualifier = '+'
2274 mem_usage = self.memory_usage(index=True, deep=deep).sum()
2275 lines.append("memory usage: {mem}\n".format(
2276 mem=_sizeof_fmt(mem_usage, size_qualifier)))
2277
2278 fmt.buffer_put_lines(buf, lines)
2279
2280 def memory_usage(self, index=True, deep=False):
2281 """
2282 Return the memory usage of each column in bytes.
2283
2284 The memory usage can optionally include the contribution of
2285 the index and elements of `object` dtype.
2286
2287 This value is displayed in `DataFrame.info` by default. This can be
2288 suppressed by setting ``pandas.options.display.memory_usage`` to False.
2289
2290 Parameters
2291 ----------
2292 index : bool, default True
2293            Specifies whether to include the memory usage of the DataFrame's
2294            index in the returned Series. If ``index=True``, the memory usage
2295            of the index is the first item in the output.
2296 deep : bool, default False
2297 If True, introspect the data deeply by interrogating
2298 `object` dtypes for system-level memory consumption, and include
2299 it in the returned values.
2300
2301 Returns
2302 -------
2303        sizes : Series
2304            A Series whose index is the original column names and whose
2305            values are the memory usage of each column in bytes.
2306
2307 See Also
2308 --------
2309 numpy.ndarray.nbytes : Total bytes consumed by the elements of an
2310 ndarray.
2311 Series.memory_usage : Bytes consumed by a Series.
2312 pandas.Categorical : Memory-efficient array for string values with
2313 many repeated values.
2314 DataFrame.info : Concise summary of a DataFrame.
2315
2316 Examples
2317 --------
2318 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
2319 >>> data = dict([(t, np.ones(shape=5000).astype(t))
2320 ... for t in dtypes])
2321 >>> df = pd.DataFrame(data)
2322 >>> df.head()
2323 int64 float64 complex128 object bool
2324 0 1 1.0 (1+0j) 1 True
2325 1 1 1.0 (1+0j) 1 True
2326 2 1 1.0 (1+0j) 1 True
2327 3 1 1.0 (1+0j) 1 True
2328 4 1 1.0 (1+0j) 1 True
2329
2330 >>> df.memory_usage()
2331 Index 80
2332 int64 40000
2333 float64 40000
2334 complex128 80000
2335 object 40000
2336 bool 5000
2337 dtype: int64
2338
2339 >>> df.memory_usage(index=False)
2340 int64 40000
2341 float64 40000
2342 complex128 80000
2343 object 40000
2344 bool 5000
2345 dtype: int64
2346
2347 The memory footprint of `object` dtype columns is ignored by default:
2348
2349 >>> df.memory_usage(deep=True)
2350 Index 80
2351 int64 40000
2352 float64 40000
2353 complex128 80000
2354 object 160000
2355 bool 5000
2356 dtype: int64
2357
2358 Use a Categorical for efficient storage of an object-dtype column with
2359 many repeated values.
2360
2361 >>> df['object'].astype('category').memory_usage(deep=True)
2362 5168
2363 """
2364 result = Series([c.memory_usage(index=False, deep=deep)
2365 for col, c in self.iteritems()], index=self.columns)
2366 if index:
2367 result = Series(self.index.memory_usage(deep=deep),
2368 index=['Index']).append(result)
2369 return result
2370
2371 def transpose(self, *args, **kwargs):
2372 """
2373 Transpose index and columns.
2374
2375 Reflect the DataFrame over its main diagonal by writing rows as columns
2376 and vice-versa. The property :attr:`.T` is an accessor to the method
2377 :meth:`transpose`.
2378
2379 Parameters
2380 ----------
2381 copy : bool, default False
2382 If True, the underlying data is copied. Otherwise (default), no
2383 copy is made if possible.
2384 *args, **kwargs
2385 Additional keywords have no effect but might be accepted for
2386 compatibility with numpy.
2387
2388 Returns
2389 -------
2390 DataFrame
2391 The transposed DataFrame.
2392
2393 See Also
2394 --------
2395 numpy.transpose : Permute the dimensions of a given array.
2396
2397 Notes
2398 -----
2399 Transposing a DataFrame with mixed dtypes will result in a homogeneous
2400 DataFrame with the `object` dtype. In such a case, a copy of the data
2401 is always made.
2402
2403 Examples
2404 --------
2405 **Square DataFrame with homogeneous dtype**
2406
2407 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
2408 >>> df1 = pd.DataFrame(data=d1)
2409 >>> df1
2410 col1 col2
2411 0 1 3
2412 1 2 4
2413
2414 >>> df1_transposed = df1.T # or df1.transpose()
2415 >>> df1_transposed
2416 0 1
2417 col1 1 2
2418 col2 3 4
2419
2420 When the dtype is homogeneous in the original DataFrame, we get a
2421 transposed DataFrame with the same dtype:
2422
2423 >>> df1.dtypes
2424 col1 int64
2425 col2 int64
2426 dtype: object
2427 >>> df1_transposed.dtypes
2428 0 int64
2429 1 int64
2430 dtype: object
2431
2432 **Non-square DataFrame with mixed dtypes**
2433
2434 >>> d2 = {'name': ['Alice', 'Bob'],
2435 ... 'score': [9.5, 8],
2436 ... 'employed': [False, True],
2437 ... 'kids': [0, 0]}
2438 >>> df2 = pd.DataFrame(data=d2)
2439 >>> df2
2440 name score employed kids
2441 0 Alice 9.5 False 0
2442 1 Bob 8.0 True 0
2443
2444 >>> df2_transposed = df2.T # or df2.transpose()
2445 >>> df2_transposed
2446 0 1
2447 name Alice Bob
2448 score 9.5 8
2449 employed False True
2450 kids 0 0
2451
2452 When the DataFrame has mixed dtypes, we get a transposed DataFrame with
2453 the `object` dtype:
2454
2455 >>> df2.dtypes
2456 name object
2457 score float64
2458 employed bool
2459 kids int64
2460 dtype: object
2461 >>> df2_transposed.dtypes
2462 0 object
2463 1 object
2464 dtype: object
2465 """
2466 nv.validate_transpose(args, dict())
2467 return super(DataFrame, self).transpose(1, 0, **kwargs)
2468
2469 T = property(transpose)
2470
2471 # ----------------------------------------------------------------------
2472 # Picklability
2473
2474 # legacy pickle formats
2475 def _unpickle_frame_compat(self, state): # pragma: no cover
2476 if len(state) == 2: # pragma: no cover
2477 series, idx = state
2478 columns = sorted(series)
2479 else:
2480 series, cols, idx = state
2481 columns = com._unpickle_array(cols)
2482
2483 index = com._unpickle_array(idx)
2484 self._data = self._init_dict(series, index, columns, None)
2485
2486 def _unpickle_matrix_compat(self, state): # pragma: no cover
2487 # old unpickling
2488 (vals, idx, cols), object_state = state
2489
2490 index = com._unpickle_array(idx)
2491 dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
2492 copy=False)
2493
2494 if object_state is not None:
2495 ovals, _, ocols = object_state
2496 objects = DataFrame(ovals, index=index,
2497 columns=com._unpickle_array(ocols), copy=False)
2498
2499 dm = dm.join(objects)
2500
2501 self._data = dm._data
2502
2503 # ----------------------------------------------------------------------
2504 # Getting and setting elements
2505
2506 def get_value(self, index, col, takeable=False):
2507 """Quickly retrieve single value at passed column and index
2508
2509 .. deprecated:: 0.21.0
2510 Use .at[] or .iat[] accessors instead.
2511
2512 Parameters
2513 ----------
2514 index : row label
2515 col : column label
2516 takeable : interpret the index/col as indexers, default False
2517
2518 Returns
2519 -------
2520 value : scalar value
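
        Examples
        --------
        Since this method is deprecated, a sketch of the preferred ``.at``
        equivalent:

        >>> df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
        >>> df.at['x', 'A']  # preferred over df.get_value('x', 'A')
        1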
2521 """
2522
2523 warnings.warn("get_value is deprecated and will be removed "
2524 "in a future release. Please use "
2525 ".at[] or .iat[] accessors instead", FutureWarning,
2526 stacklevel=2)
2527 return self._get_value(index, col, takeable=takeable)
2528
2529 def _get_value(self, index, col, takeable=False):
2530
2531 if takeable:
2532 series = self._iget_item_cache(col)
2533 return com._maybe_box_datetimelike(series._values[index])
2534
2535 series = self._get_item_cache(col)
2536 engine = self.index._engine
2537
2538 try:
2539 return engine.get_value(series._values, index)
2540 except (TypeError, ValueError):
2541
2542 # we cannot handle direct indexing
2543 # use positional
2544 col = self.columns.get_loc(col)
2545 index = self.index.get_loc(index)
2546 return self._get_value(index, col, takeable=True)
2547 _get_value.__doc__ = get_value.__doc__
2548
2549 def set_value(self, index, col, value, takeable=False):
2550 """Put single value at passed column and index
2551
2552 .. deprecated:: 0.21.0
2553 Use .at[] or .iat[] accessors instead.
2554
2555 Parameters
2556 ----------
2557 index : row label
2558 col : column label
2559 value : scalar value
2560 takeable : interpret the index/col as indexers, default False
2561
2562 Returns
2563 -------
2564 frame : DataFrame
2565 If label pair is contained, will be reference to calling DataFrame,
2566 otherwise a new object
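
        Examples
        --------
        Since this method is deprecated, a sketch of the preferred ``.at``
        equivalent:

        >>> df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
        >>> df.at['x', 'A'] = 10  # preferred over df.set_value('x', 'A', 10)
        >>> df.at['x', 'A']
        10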
2567 """
2568 warnings.warn("set_value is deprecated and will be removed "
2569 "in a future release. Please use "
2570 ".at[] or .iat[] accessors instead", FutureWarning,
2571 stacklevel=2)
2572 return self._set_value(index, col, value, takeable=takeable)
2573
2574 def _set_value(self, index, col, value, takeable=False):
2575 try:
2576 if takeable is True:
2577 series = self._iget_item_cache(col)
2578 return series._set_value(index, value, takeable=True)
2579
2580 series = self._get_item_cache(col)
2581 engine = self.index._engine
2582 engine.set_value(series._values, index, value)
2583 return self
2584 except (KeyError, TypeError):
2585
2586 # set using a non-recursive method & reset the cache
2587 self.loc[index, col] = value
2588 self._item_cache.pop(col, None)
2589
2590 return self
2591 _set_value.__doc__ = set_value.__doc__
2592
2593 def _ixs(self, i, axis=0):
2594 """
2595 i : int, slice, or sequence of integers
2596 axis : int
2597 """
2598
2599 # irow
2600 if axis == 0:
2601            # Note: if a slice is passed, the resulting data will be a view.
2606
2607 if isinstance(i, slice):
2608 return self[i]
2609 else:
2610 label = self.index[i]
2611 if isinstance(label, Index):
2612 # a location index by definition
2613 result = self.take(i, axis=axis)
2614 copy = True
2615 else:
2616 new_values = self._data.fast_xs(i)
2617 if is_scalar(new_values):
2618 return new_values
2619
2620 # if we are a copy, mark as such
2621 copy = (isinstance(new_values, np.ndarray) and
2622 new_values.base is None)
2623 result = self._constructor_sliced(new_values,
2624 index=self.columns,
2625 name=self.index[i],
2626 dtype=new_values.dtype)
2627 result._set_is_copy(self, copy=copy)
2628 return result
2629
2630 # icol
2631 else:
2632            # Note: if a slice is passed, the resulting data will be a view.
2637
2638 label = self.columns[i]
2639 if isinstance(i, slice):
2640 # need to return view
2641 lab_slice = slice(label[0], label[-1])
2642 return self.loc[:, lab_slice]
2643 else:
2644 if isinstance(label, Index):
2645 return self._take(i, axis=1)
2646
2647 index_len = len(self.index)
2648
2649 # if the values returned are not the same length
2650 # as the index (iow a not found value), iget returns
2651            # as the index (in other words, a not-found value), iget returns
2652 # a numpy error (as numpy should really raise)
2653 values = self._data.iget(i)
2654
2655 if index_len and not len(values):
2656 values = np.array([np.nan] * index_len, dtype=object)
2657 result = self._box_col_values(values, label)
2658
2659 # this is a cached value, mark it so
2660 result._set_as_cached(label, self)
2661
2662 return result
2663
2664 def __getitem__(self, key):
2665 key = com._apply_if_callable(key, self)
2666
2667 # shortcut if we are an actual column
2668 is_mi_columns = isinstance(self.columns, MultiIndex)
2669 try:
2670 if key in self.columns and not is_mi_columns:
2671 return self._getitem_column(key)
2672        except Exception:
2673            pass
2674
2675 # see if we can slice the rows
2676 indexer = convert_to_index_sliceable(self, key)
2677 if indexer is not None:
2678 return self._getitem_slice(indexer)
2679
2680 if isinstance(key, (Series, np.ndarray, Index, list)):
2681 # either boolean or fancy integer index
2682 return self._getitem_array(key)
2683 elif isinstance(key, DataFrame):
2684 return self._getitem_frame(key)
2685 elif is_mi_columns:
2686 return self._getitem_multilevel(key)
2687 else:
2688 return self._getitem_column(key)
2689
2690 def _getitem_column(self, key):
2691 """ return the actual column """
2692
2693 # get column
2694 if self.columns.is_unique:
2695 return self._get_item_cache(key)
2696
2697 # duplicate columns & possible reduce dimensionality
2698 result = self._constructor(self._data.get(key))
2699 if result.columns.is_unique:
2700 result = result[key]
2701
2702 return result
2703
2704 def _getitem_slice(self, key):
2705 return self._slice(key, axis=0)
2706
2707 def _getitem_array(self, key):
2708 # also raises Exception if object array with NA values
2709 if com.is_bool_indexer(key):
2710 # warning here just in case -- previously __setitem__ was
2711 # reindexing but __getitem__ was not; it seems more reasonable to
2712 # go with the __setitem__ behavior since that is more consistent
2713 # with all other indexing behavior
2714 if isinstance(key, Series) and not key.index.equals(self.index):
2715 warnings.warn("Boolean Series key will be reindexed to match "
2716 "DataFrame index.", UserWarning, stacklevel=3)
2717 elif len(key) != len(self.index):
2718 raise ValueError('Item wrong length %d instead of %d.' %
2719 (len(key), len(self.index)))
2720 # check_bool_indexer will throw exception if Series key cannot
2721 # be reindexed to match DataFrame rows
2722 key = check_bool_indexer(self.index, key)
2723 indexer = key.nonzero()[0]
2724 return self._take(indexer, axis=0)
2725 else:
2726 indexer = self.loc._convert_to_indexer(key, axis=1)
2727 return self._take(indexer, axis=1)
2728
2729 def _getitem_multilevel(self, key):
2730 loc = self.columns.get_loc(key)
2731 if isinstance(loc, (slice, Series, np.ndarray, Index)):
2732 new_columns = self.columns[loc]
2733 result_columns = maybe_droplevels(new_columns, key)
2734 if self._is_mixed_type:
2735 result = self.reindex(columns=new_columns)
2736 result.columns = result_columns
2737 else:
2738 new_values = self.values[:, loc]
2739 result = self._constructor(new_values, index=self.index,
2740 columns=result_columns)
2741 result = result.__finalize__(self)
2742
2743 # If there is only one column being returned, and its name is
2744 # either an empty string, or a tuple with an empty string as its
2745 # first element, then treat the empty string as a placeholder
2746 # and return the column as if the user had provided that empty
2747 # string in the key. If the result is a Series, exclude the
2748 # implied empty string from its name.
2749 if len(result.columns) == 1:
2750 top = result.columns[0]
2751 if isinstance(top, tuple):
2752 top = top[0]
2753 if top == '':
2754 result = result['']
2755 if isinstance(result, Series):
2756 result = self._constructor_sliced(result,
2757 index=self.index,
2758 name=key)
2759
2760 result._set_is_copy(self)
2761 return result
2762 else:
2763 return self._get_item_cache(key)
2764
2765 def _getitem_frame(self, key):
2766 if key.values.size and not is_bool_dtype(key.values):
2767 raise ValueError('Must pass DataFrame with boolean values only')
2768 return self.where(key)
2769
2770 def query(self, expr, inplace=False, **kwargs):
2771 """Query the columns of a frame with a boolean expression.
2772
2773 Parameters
2774 ----------
2775 expr : string
2776 The query string to evaluate. You can refer to variables
2777 in the environment by prefixing them with an '@' character like
2778 ``@a + b``.
2779 inplace : bool
2780 Whether the query should modify the data in place or return
2781 a modified copy
2782
2783 .. versionadded:: 0.18.0
2784
2785 kwargs : dict
2786 See the documentation for :func:`pandas.eval` for complete details
2787 on the keyword arguments accepted by :meth:`DataFrame.query`.
2788
2789 Returns
2790 -------
2791 q : DataFrame
2792
2793 Notes
2794 -----
2795 The result of the evaluation of this expression is first passed to
2796 :attr:`DataFrame.loc` and if that fails because of a
2797 multidimensional key (e.g., a DataFrame) then the result will be passed
2798 to :meth:`DataFrame.__getitem__`.
2799
2800 This method uses the top-level :func:`pandas.eval` function to
2801 evaluate the passed query.
2802
2803 The :meth:`~pandas.DataFrame.query` method uses a slightly
2804 modified Python syntax by default. For example, the ``&`` and ``|``
2805 (bitwise) operators have the precedence of their boolean cousins,
2806 :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
2807 however the semantics are different.
2808
2809 You can change the semantics of the expression by passing the keyword
2810 argument ``parser='python'``. This enforces the same semantics as
2811 evaluation in Python space. Likewise, you can pass ``engine='python'``
2812 to evaluate an expression using Python itself as a backend. This is not
2813 recommended as it is inefficient compared to using ``numexpr`` as the
2814 engine.
2815
2816 The :attr:`DataFrame.index` and
2817 :attr:`DataFrame.columns` attributes of the
2818 :class:`~pandas.DataFrame` instance are placed in the query namespace
2819 by default, which allows you to treat both the index and columns of the
2820 frame as a column in the frame.
2821 The identifier ``index`` is used for the frame index; you can also
2822 use the name of the index to identify it in a query. Please note that
2823 Python keywords may not be used as identifiers.
2824
2825 For further details and examples see the ``query`` documentation in
2826 :ref:`indexing <indexing.query>`.
2827
2828 See Also
2829 --------
2830 pandas.eval
2831 DataFrame.eval
2832
2833 Examples
2834 --------
2835        >>> from numpy.random import randn
2836        >>> df = pd.DataFrame(randn(10, 2), columns=list('ab'))
2838 >>> df.query('a > b')
2839 >>> df[df.a > df.b] # same result as the previous expression
2840 """
2841 inplace = validate_bool_kwarg(inplace, 'inplace')
2842 if not isinstance(expr, compat.string_types):
2843 msg = "expr must be a string to be evaluated, {0} given"
2844 raise ValueError(msg.format(type(expr)))
2845 kwargs['level'] = kwargs.pop('level', 0) + 1
2846 kwargs['target'] = None
2847 res = self.eval(expr, **kwargs)
2848
2849 try:
2850 new_data = self.loc[res]
2851 except ValueError:
2852 # when res is multi-dimensional loc raises, but this is sometimes a
2853 # valid query
2854 new_data = self[res]
2855
2856 if inplace:
2857 self._update_inplace(new_data)
2858 else:
2859 return new_data
2860
2861 def eval(self, expr, inplace=False, **kwargs):
2862 """
2863 Evaluate a string describing operations on DataFrame columns.
2864
2865 Operates on columns only, not specific rows or elements. This allows
2866 `eval` to run arbitrary code, which can make you vulnerable to code
2867 injection if you pass user input to this function.
2868
2869 Parameters
2870 ----------
2871 expr : str
2872 The expression string to evaluate.
2873 inplace : bool, default False
2874 If the expression contains an assignment, whether to perform the
2875 operation inplace and mutate the existing DataFrame. Otherwise,
2876 a new DataFrame is returned.
2877
2878            .. versionadded:: 0.18.0

2879 kwargs : dict
2880 See the documentation for :func:`~pandas.eval` for complete details
2881 on the keyword arguments accepted by
2882            :meth:`~pandas.DataFrame.eval`.
2883
2884 Returns
2885 -------
2886 ndarray, scalar, or pandas object
2887 The result of the evaluation.
2888
2889 See Also
2890 --------
2891 DataFrame.query : Evaluates a boolean expression to query the columns
2892 of a frame.
2893 DataFrame.assign : Can evaluate an expression or function to create new
2894 values for a column.
2895 pandas.eval : Evaluate a Python expression as a string using various
2896 backends.
2897
2898 Notes
2899 -----
2900 For more details see the API documentation for :func:`~pandas.eval`.
2901 For detailed examples see :ref:`enhancing performance with eval
2902 <enhancingperf.eval>`.
2903
2904 Examples
2905 --------
2906 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
2907 >>> df
2908 A B
2909 0 1 10
2910 1 2 8
2911 2 3 6
2912 3 4 4
2913 4 5 2
2914 >>> df.eval('A + B')
2915 0 11
2916 1 10
2917 2 9
2918 3 8
2919 4 7
2920 dtype: int64
2921
2922 Assignment is allowed though by default the original DataFrame is not
2923 modified.
2924
2925 >>> df.eval('C = A + B')
2926 A B C
2927 0 1 10 11
2928 1 2 8 10
2929 2 3 6 9
2930 3 4 4 8
2931 4 5 2 7
2932 >>> df
2933 A B
2934 0 1 10
2935 1 2 8
2936 2 3 6
2937 3 4 4
2938 4 5 2
2939
2940 Use ``inplace=True`` to modify the original DataFrame.
2941
2942 >>> df.eval('C = A + B', inplace=True)
2943 >>> df
2944 A B C
2945 0 1 10 11
2946 1 2 8 10
2947 2 3 6 9
2948 3 4 4 8
2949 4 5 2 7
2950 """
2951 from pandas.core.computation.eval import eval as _eval
2952
2953 inplace = validate_bool_kwarg(inplace, 'inplace')
2954 resolvers = kwargs.pop('resolvers', None)
2955 kwargs['level'] = kwargs.pop('level', 0) + 1
2956 if resolvers is None:
2957 index_resolvers = self._get_index_resolvers()
2958 resolvers = dict(self.iteritems()), index_resolvers
2959 if 'target' not in kwargs:
2960 kwargs['target'] = self
2961 kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
2962 return _eval(expr, inplace=inplace, **kwargs)
2963
2964 def select_dtypes(self, include=None, exclude=None):
2965 """
2966 Return a subset of the DataFrame's columns based on the column dtypes.
2967
2968 Parameters
2969 ----------
2970 include, exclude : scalar or list-like
2971 A selection of dtypes or strings to be included/excluded. At least
2972 one of these parameters must be supplied.
2973
2974 Raises
2975 ------
2976 ValueError
2977 * If both of ``include`` and ``exclude`` are empty
2978 * If ``include`` and ``exclude`` have overlapping elements
2979 * If any kind of string dtype is passed in.
2980
2981 Returns
2982 -------
2983 subset : DataFrame
2984 The subset of the frame including the dtypes in ``include`` and
2985 excluding the dtypes in ``exclude``.
2986
2987 Notes
2988 -----
2989 * To select all *numeric* types, use ``np.number`` or ``'number'``
2990 * To select strings you must use the ``object`` dtype, but note that
2991 this will return *all* object dtype columns
2992 * See the `numpy dtype hierarchy
2993 <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
2994 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
2995 ``'datetime64'``
2996 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
2997 ``'timedelta64'``
2998 * To select Pandas categorical dtypes, use ``'category'``
2999 * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
3000 0.20.0) or ``'datetime64[ns, tz]'``
3001
3002 Examples
3003 --------
3004 >>> df = pd.DataFrame({'a': [1, 2] * 3,
3005 ... 'b': [True, False] * 3,
3006 ... 'c': [1.0, 2.0] * 3})
3007 >>> df
3008 a b c
3009 0 1 True 1.0
3010 1 2 False 2.0
3011 2 1 True 1.0
3012 3 2 False 2.0
3013 4 1 True 1.0
3014 5 2 False 2.0
3015
3016 >>> df.select_dtypes(include='bool')
3017 b
3018 0 True
3019 1 False
3020 2 True
3021 3 False
3022 4 True
3023 5 False
3024
3025 >>> df.select_dtypes(include=['float64'])
3026 c
3027 0 1.0
3028 1 2.0
3029 2 1.0
3030 3 2.0
3031 4 1.0
3032 5 2.0
3033
3034 >>> df.select_dtypes(exclude=['int'])
3035 b c
3036 0 True 1.0
3037 1 False 2.0
3038 2 True 1.0
3039 3 False 2.0
3040 4 True 1.0
3041 5 False 2.0
3042 """
3043
3044 if not is_list_like(include):
3045 include = (include,) if include is not None else ()
3046 if not is_list_like(exclude):
3047 exclude = (exclude,) if exclude is not None else ()
3048
3049 selection = tuple(map(frozenset, (include, exclude)))
3050
3051 if not any(selection):
3052 raise ValueError('at least one of include or exclude must be '
3053 'nonempty')
3054
3055        # convert the myriad valid dtype objects to a single representation
3056 include, exclude = map(
3057 lambda x: frozenset(map(_get_dtype_from_object, x)), selection)
3058 for dtypes in (include, exclude):
3059 invalidate_string_dtypes(dtypes)
3060
3061 # can't both include AND exclude!
3062 if not include.isdisjoint(exclude):
3063 raise ValueError('include and exclude overlap on {inc_ex}'.format(
3064 inc_ex=(include & exclude)))
3065
3066 # empty include/exclude -> defaults to True
3067 # three cases (we've already raised if both are empty)
3068 # case 1: empty include, nonempty exclude
3069 # we have True, True, ... True for include, same for exclude
3070 # in the loop below we get the excluded
3071 # and when we call '&' below we get only the excluded
3072 # case 2: nonempty include, empty exclude
3073 # same as case 1, but with include
3074 # case 3: both nonempty
3075 # the "union" of the logic of case 1 and case 2:
3076 # we get the included and excluded, and return their logical and
3077 include_these = Series(not bool(include), index=self.columns)
3078 exclude_these = Series(not bool(exclude), index=self.columns)
3079
3080 def is_dtype_instance_mapper(idx, dtype):
3081 return idx, functools.partial(issubclass, dtype.type)
3082
3083 for idx, f in itertools.starmap(is_dtype_instance_mapper,
3084 enumerate(self.dtypes)):
3085 if include: # checks for the case of empty include or exclude
3086 include_these.iloc[idx] = any(map(f, include))
3087 if exclude:
3088 exclude_these.iloc[idx] = not any(map(f, exclude))
3089
3090 dtype_indexer = include_these & exclude_these
3091 return self.loc[com._get_info_slice(self, dtype_indexer)]
3092
3093 def _box_item_values(self, key, values):
3094 items = self.columns[self.columns.get_loc(key)]
3095 if values.ndim == 2:
3096 return self._constructor(values.T, columns=items, index=self.index)
3097 else:
3098 return self._box_col_values(values, items)
3099
3100 def _box_col_values(self, values, items):
3101 """ provide boxed values for a column """
3102 klass = _get_sliced_frame_result_type(values, self)
3103 return klass(values, index=self.index, name=items, fastpath=True)
3104
3105 def __setitem__(self, key, value):
3106 key = com._apply_if_callable(key, self)
3107
3108 # see if we can slice the rows
3109 indexer = convert_to_index_sliceable(self, key)
3110 if indexer is not None:
3111 return self._setitem_slice(indexer, value)
3112
3113 if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
3114 self._setitem_frame(key, value)
3115 elif isinstance(key, (Series, np.ndarray, list, Index)):
3116 self._setitem_array(key, value)
3117 else:
3118 # set column
3119 self._set_item(key, value)
3120
3121 def _setitem_slice(self, key, value):
3122 self._check_setitem_copy()
3123 self.loc._setitem_with_indexer(key, value)
3124
3125 def _setitem_array(self, key, value):
3126 # also raises Exception if object array with NA values
3127 if com.is_bool_indexer(key):
3128 if len(key) != len(self.index):
3129 raise ValueError('Item wrong length %d instead of %d!' %
3130 (len(key), len(self.index)))
3131 key = check_bool_indexer(self.index, key)
3132 indexer = key.nonzero()[0]
3133 self._check_setitem_copy()
3134 self.loc._setitem_with_indexer(indexer, value)
3135 else:
3136 if isinstance(value, DataFrame):
3137 if len(value.columns) != len(key):
3138 raise ValueError('Columns must be same length as key')
3139 for k1, k2 in zip(key, value.columns):
3140 self[k1] = value[k2]
3141 else:
3142 indexer = self.loc._convert_to_indexer(key, axis=1)
3143 self._check_setitem_copy()
3144 self.loc._setitem_with_indexer((slice(None), indexer), value)
3145
3146 def _setitem_frame(self, key, value):
3147 # support boolean setting with DataFrame input, e.g.
3148 # df[df > df2] = 0
3149 if isinstance(key, np.ndarray):
3150 if key.shape != self.shape:
3151 raise ValueError(
3152 'Array conditional must be same shape as self'
3153 )
3154 key = self._constructor(key, **self._construct_axes_dict())
3155
3156 if key.values.size and not is_bool_dtype(key.values):
3157 raise TypeError(
3158 'Must pass DataFrame or 2-d ndarray with boolean values only'
3159 )
3160
3161 self._check_inplace_setting(value)
3162 self._check_setitem_copy()
3163        self._where(~key, value, inplace=True)
3164
3165 def _ensure_valid_index(self, value):
3166 """
3167        ensure that if we don't have an index, we can create one from the
3168        passed value
3169 """
3170 # GH5632, make sure that we are a Series convertible
3171 if not len(self.index) and is_list_like(value):
3172 try:
3173 value = Series(value)
3174            except Exception:
3175 raise ValueError('Cannot set a frame with no defined index '
3176 'and a value that cannot be converted to a '
3177 'Series')
3178
3179 self._data = self._data.reindex_axis(value.index.copy(), axis=1,
3180 fill_value=np.nan)
3181
3182 def _set_item(self, key, value):
3183 """
3184 Add series to DataFrame in specified column.
3185
3186        If series is a numpy-array (not a Series/TimeSeries), it must be the
3187        same length as the DataFrame's index or an error will be thrown.
3188
3189        Series/TimeSeries will be conformed to the DataFrame's index to
3190        ensure homogeneity.
3191 """
3192
3193 self._ensure_valid_index(value)
3194 value = self._sanitize_column(key, value)
3195 NDFrame._set_item(self, key, value)
3196
3197 # check if we are modifying a copy
3198 # try to set first as we want an invalid
3199 # value exception to occur first
3200 if len(self):
3201 self._check_setitem_copy()
3202
3203 def insert(self, loc, column, value, allow_duplicates=False):
3204 """
3205 Insert column into DataFrame at specified location.
3206
3207 Raises a ValueError if `column` is already contained in the DataFrame,
3208 unless `allow_duplicates` is set to True.
3209
3210 Parameters
3211 ----------
3212        loc : int
3213            Insertion index. Must satisfy 0 <= loc <= len(columns).
3214        column : string, number, or hashable object
3215            Label of the inserted column.
3216 value : int, Series, or array-like
3217 allow_duplicates : bool, optional
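
        Examples
        --------
        A small sketch inserting a new column at the front:

        >>> df = pd.DataFrame({'B': [1, 2], 'C': [3, 4]})
        >>> df.insert(0, 'A', [9, 9])
        >>> df
           A  B  C
        0  9  1  3
        1  9  2  4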
3218 """
3219 self._ensure_valid_index(value)
3220 value = self._sanitize_column(column, value, broadcast=False)
3221 self._data.insert(loc, column, value,
3222 allow_duplicates=allow_duplicates)
3223
3224 def assign(self, **kwargs):
3225 r"""
3226 Assign new columns to a DataFrame, returning a new object
3227 (a copy) with the new columns added to the original ones.
3228 Existing columns that are re-assigned will be overwritten.
3229
3230 Parameters
3231 ----------
3232 kwargs : keyword, value pairs
3233 keywords are the column names. If the values are
3234 callable, they are computed on the DataFrame and
3235 assigned to the new columns. The callable must not
3236 change input DataFrame (though pandas doesn't check it).
3237 If the values are not callable, (e.g. a Series, scalar, or array),
3238 they are simply assigned.
3239
3240 Returns
3241 -------
3242 df : DataFrame
3243 A new DataFrame with the new columns in addition to
3244 all the existing columns.
3245
3246 Notes
3247 -----
3248 Assigning multiple columns within the same ``assign`` is possible.
3249 For Python 3.6 and above, later items in '\*\*kwargs' may refer to
3250 newly created or modified columns in 'df'; items are computed and
3251 assigned into 'df' in order. For Python 3.5 and below, the order of
3252        keyword arguments is not specified, so you cannot refer to newly
3253        created or modified columns. All items are computed first, and then
3254        assigned in alphabetical order.
3255
3256 .. versionchanged :: 0.23.0
3257
3258 Keyword argument order is maintained for Python 3.6 and later.
3259
3260 Examples
3261 --------
3262 >>> df = pd.DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})
3263
3264 Where the value is a callable, evaluated on `df`:
3265
3266 >>> df.assign(ln_A = lambda x: np.log(x.A))
3267 A B ln_A
3268 0 1 0.426905 0.000000
3269 1 2 -0.780949 0.693147
3270 2 3 -0.418711 1.098612
3271 3 4 -0.269708 1.386294
3272 4 5 -0.274002 1.609438
3273 5 6 -0.500792 1.791759
3274 6 7 1.649697 1.945910
3275 7 8 -1.495604 2.079442
3276 8 9 0.549296 2.197225
3277 9 10 -0.758542 2.302585
3278
3279 Where the value already exists and is inserted:
3280
3281 >>> newcol = np.log(df['A'])
3282 >>> df.assign(ln_A=newcol)
3283 A B ln_A
3284 0 1 0.426905 0.000000
3285 1 2 -0.780949 0.693147
3286 2 3 -0.418711 1.098612
3287 3 4 -0.269708 1.386294
3288 4 5 -0.274002 1.609438
3289 5 6 -0.500792 1.791759
3290 6 7 1.649697 1.945910
3291 7 8 -1.495604 2.079442
3292 8 9 0.549296 2.197225
3293 9 10 -0.758542 2.302585
3294
3295 Where the keyword arguments depend on each other
3296
3297 >>> df = pd.DataFrame({'A': [1, 2, 3]})
3298
3299        >>> df.assign(B=df.A, C=lambda x: x['A'] + x['B'])
3300 A B C
3301 0 1 1 2
3302 1 2 2 4
3303 2 3 3 6
3304 """
3305 data = self.copy()
3306
3307 # >= 3.6 preserve order of kwargs
3308 if PY36:
3309 for k, v in kwargs.items():
3310 data[k] = com._apply_if_callable(v, data)
3311 else:
3312 # <= 3.5: do all calculations first...
3313 results = OrderedDict()
3314 for k, v in kwargs.items():
3315 results[k] = com._apply_if_callable(v, data)
3316
3317 # <= 3.5 and earlier
3318 results = sorted(results.items())
3319 # ... and then assign
3320 for k, v in results:
3321 data[k] = v
3322 return data
3323
3324 def _sanitize_column(self, key, value, broadcast=True):
3325 """
3326 Ensures new columns (which go into the BlockManager as new blocks) are
3327 always copied and converted into an array.
3328
3329 Parameters
3330 ----------
3331 key : object
3332 value : scalar, Series, or array-like
3333 broadcast : bool, default True
3334 If ``key`` matches multiple duplicate column names in the
3335 DataFrame, this parameter indicates whether ``value`` should be
3336 tiled so that the returned array contains a (duplicated) column for
3337 each occurrence of the key. If False, ``value`` will not be tiled.
3338
3339 Returns
3340 -------
3341 sanitized_column : numpy-array
3342 """
3343
3344 def reindexer(value):
3345 # reindex if necessary
3346
3347 if value.index.equals(self.index) or not len(self.index):
3348 value = value._values.copy()
3349 else:
3350
3351 # GH 4107
3352 try:
3353 value = value.reindex(self.index)._values
3354 except Exception as e:
3355
3356 # duplicate axis
3357 if not value.index.is_unique:
3358 raise e
3359
3360 # other
3361 raise TypeError('incompatible index of inserted column '
3362 'with frame index')
3363 return value
3364
3365 if isinstance(value, Series):
3366 value = reindexer(value)
3367
3368 elif isinstance(value, DataFrame):
3369 # align right-hand-side columns if self.columns
3370 # is multi-index and self[key] is a sub-frame
3371 if isinstance(self.columns, MultiIndex) and key in self.columns:
3372 loc = self.columns.get_loc(key)
3373 if isinstance(loc, (slice, Series, np.ndarray, Index)):
3374 cols = maybe_droplevels(self.columns[loc], key)
3375 if len(cols) and not cols.equals(value.columns):
3376 value = value.reindex(cols, axis=1)
3377 # now align rows
3378 value = reindexer(value).T
3379
3380 elif isinstance(value, ExtensionArray):
3381 from pandas.core.series import _sanitize_index
3382 # Explicitly copy here, instead of in _sanitize_index,
3383 # as sanitize_index won't copy an EA, even with copy=True
3384 value = value.copy()
3385 value = _sanitize_index(value, self.index, copy=False)
3386
3387 elif isinstance(value, Index) or is_sequence(value):
3388 from pandas.core.series import _sanitize_index
3389
3390 # turn me into an ndarray
3391 value = _sanitize_index(value, self.index, copy=False)
3392 if not isinstance(value, (np.ndarray, Index)):
3393 if isinstance(value, list) and len(value) > 0:
3394 value = maybe_convert_platform(value)
3395 else:
3396 value = com._asarray_tuplesafe(value)
3397 elif value.ndim == 2:
3398 value = value.copy().T
3399 elif isinstance(value, Index):
3400 value = value.copy(deep=True)
3401 else:
3402 value = value.copy()
3403
3404 # possibly infer to datetimelike
3405 if is_object_dtype(value.dtype):
3406 value = maybe_infer_to_datetimelike(value)
3407
3408 else:
3409 # upcast the scalar
3410 value = cast_scalar_to_array(len(self.index), value)
3411 value = maybe_cast_to_datetime(value, value.dtype)
3412
3413 # return internal types directly
3414 if is_extension_type(value) or is_extension_array_dtype(value):
3415 return value
3416
3417 # broadcast across multiple columns if necessary
3418 if broadcast and key in self.columns and value.ndim == 1:
3419 if (not self.columns.is_unique or
3420 isinstance(self.columns, MultiIndex)):
3421 existing_piece = self[key]
3422 if isinstance(existing_piece, DataFrame):
3423 value = np.tile(value, (len(existing_piece.columns), 1))
3424
3425 return np.atleast_2d(np.asarray(value))
3426
3427 @property
3428 def _series(self):
3429 result = {}
3430 for idx, item in enumerate(self.columns):
3431 result[item] = Series(self._data.iget(idx), index=self.index,
3432 name=item)
3433 return result
3434
3435 def lookup(self, row_labels, col_labels):
3436 """Label-based "fancy indexing" function for DataFrame.
3437 Given equal-length arrays of row and column labels, return an
3438 array of the values corresponding to each (row, col) pair.
3439
3440 Parameters
3441 ----------
3442 row_labels : sequence
3443 The row labels to use for lookup
3444 col_labels : sequence
3445 The column labels to use for lookup
3446
3447 Notes
3448 -----
3449 Akin to::
3450
3451 result = []
3452 for row, col in zip(row_labels, col_labels):
3453 result.append(df.get_value(row, col))
3454
3455        Returns
3456        -------
3457        values : ndarray
3458            The found values
3459
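        Examples
        --------
        A minimal sketch:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['x', 'y'])
        >>> df.lookup(['x', 'y'], ['B', 'A'])
        array([3, 2])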
3460 """
3461 n = len(row_labels)
3462 if n != len(col_labels):
3463 raise ValueError('Row labels must have same size as column labels')
3464
3465 thresh = 1000
3466 if not self._is_mixed_type or n > thresh:
3467 values = self.values
3468 ridx = self.index.get_indexer(row_labels)
3469 cidx = self.columns.get_indexer(col_labels)
3470 if (ridx == -1).any():
3471 raise KeyError('One or more row labels was not found')
3472 if (cidx == -1).any():
3473 raise KeyError('One or more column labels was not found')
3474 flat_index = ridx * len(self.columns) + cidx
3475 result = values.flat[flat_index]
3476 else:
3477 result = np.empty(n, dtype='O')
3478 for i, (r, c) in enumerate(zip(row_labels, col_labels)):
3479 result[i] = self._get_value(r, c)
3480
3481 if is_object_dtype(result):
3482 result = lib.maybe_convert_objects(result)
3483
3484 return result
3485
3486 # ----------------------------------------------------------------------
3487 # Reindexing and alignment
3488
3489 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
3490 copy):
3491 frame = self
3492
3493 columns = axes['columns']
3494 if columns is not None:
3495 frame = frame._reindex_columns(columns, method, copy, level,
3496 fill_value, limit, tolerance)
3497
3498 index = axes['index']
3499 if index is not None:
3500 frame = frame._reindex_index(index, method, copy, level,
3501 fill_value, limit, tolerance)
3502
3503 return frame
3504
3505 def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
3506 limit=None, tolerance=None):
3507 new_index, indexer = self.index.reindex(new_index, method=method,
3508 level=level, limit=limit,
3509 tolerance=tolerance)
3510 return self._reindex_with_indexers({0: [new_index, indexer]},
3511 copy=copy, fill_value=fill_value,
3512 allow_dups=False)
3513
3514 def _reindex_columns(self, new_columns, method, copy, level,
3515 fill_value=None, limit=None, tolerance=None):
3516 new_columns, indexer = self.columns.reindex(new_columns, method=method,
3517 level=level, limit=limit,
3518 tolerance=tolerance)
3519 return self._reindex_with_indexers({1: [new_columns, indexer]},
3520 copy=copy, fill_value=fill_value,
3521 allow_dups=False)
3522
3523 def _reindex_multi(self, axes, copy, fill_value):
3524 """ we are guaranteed non-Nones in the axes! """
3525
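        # When both axes produce an indexer we can reindex with a single
        # 2-D take; otherwise fall back to the generic per-axis path.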
3526 new_index, row_indexer = self.index.reindex(axes['index'])
3527 new_columns, col_indexer = self.columns.reindex(axes['columns'])
3528
3529 if row_indexer is not None and col_indexer is not None:
3530 indexer = row_indexer, col_indexer
3531 new_values = algorithms.take_2d_multi(self.values, indexer,
3532 fill_value=fill_value)
3533 return self._constructor(new_values, index=new_index,
3534 columns=new_columns)
3535 else:
3536 return self._reindex_with_indexers({0: [new_index, row_indexer],
3537 1: [new_columns, col_indexer]},
3538 copy=copy,
3539 fill_value=fill_value)
3540
3541 @Appender(_shared_docs['align'] % _shared_doc_kwargs)
3542 def align(self, other, join='outer', axis=None, level=None, copy=True,
3543 fill_value=None, method=None, limit=None, fill_axis=0,
3544 broadcast_axis=None):
3545 return super(DataFrame, self).align(other, join=join, axis=axis,
3546 level=level, copy=copy,
3547 fill_value=fill_value,
3548 method=method, limit=limit,
3549 fill_axis=fill_axis,
3550 broadcast_axis=broadcast_axis)
3551
3552 @Appender(_shared_docs['reindex'] % _shared_doc_kwargs)
3553 @rewrite_axis_style_signature('labels', [('method', None),
3554 ('copy', True),
3555 ('level', None),
3556 ('fill_value', np.nan),
3557 ('limit', None),
3558 ('tolerance', None)])
3559 def reindex(self, *args, **kwargs):
3560 axes = validate_axis_style_args(self, args, kwargs, 'labels',
3561 'reindex')
3562 kwargs.update(axes)
3563 # Pop these, since the values are in `kwargs` under different names
3564 kwargs.pop('axis', None)
3565 kwargs.pop('labels', None)
3566 return super(DataFrame, self).reindex(**kwargs)
3567
3568 @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
3569 def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
3570 limit=None, fill_value=np.nan):
3571 return super(DataFrame,
3572 self).reindex_axis(labels=labels, axis=axis,
3573 method=method, level=level, copy=copy,
3574 limit=limit, fill_value=fill_value)
3575
3576 def drop(self, labels=None, axis=0, index=None, columns=None,
3577 level=None, inplace=False, errors='raise'):
3578 """
3579 Drop specified labels from rows or columns.
3580
3581 Remove rows or columns by specifying label names and corresponding
3582 axis, or by specifying directly index or column names. When using a
3583 multi-index, labels on different levels can be removed by specifying
3584 the level.
3585
3586 Parameters
3587 ----------
3588 labels : single label or list-like
3589 Index or column labels to drop.
3590 axis : {0 or 'index', 1 or 'columns'}, default 0
3591 Whether to drop labels from the index (0 or 'index') or
3592 columns (1 or 'columns').
3593 index, columns : single label or list-like
3594 Alternative to specifying axis (``labels, axis=1``
3595 is equivalent to ``columns=labels``).
3596
            .. versionadded:: 0.21.0

3598 level : int or level name, optional
3599 For MultiIndex, level from which the labels will be removed.
3600 inplace : bool, default False
3601 If True, do operation inplace and return None.
3602 errors : {'ignore', 'raise'}, default 'raise'
3603 If 'ignore', suppress error and only existing labels are
3604 dropped.
3605
3606 Returns
3607 -------
3608 dropped : pandas.DataFrame
3609
3610 See Also
3611 --------
3612 DataFrame.loc : Label-location based indexer for selection by label.
3613 DataFrame.dropna : Return DataFrame with labels on given axis omitted
3614 where (all or any) data are missing
3615 DataFrame.drop_duplicates : Return DataFrame with duplicate rows
3616 removed, optionally only considering certain columns
3617 Series.drop : Return Series with specified index labels removed.
3618
3619 Raises
3620 ------
3621 KeyError
3622 If none of the labels are found in the selected axis
3623
3624 Examples
3625 --------
3626 >>> df = pd.DataFrame(np.arange(12).reshape(3,4),
3627 ... columns=['A', 'B', 'C', 'D'])
3628 >>> df
3629 A B C D
3630 0 0 1 2 3
3631 1 4 5 6 7
3632 2 8 9 10 11
3633
3634 Drop columns
3635
3636 >>> df.drop(['B', 'C'], axis=1)
3637 A D
3638 0 0 3
3639 1 4 7
3640 2 8 11
3641
3642 >>> df.drop(columns=['B', 'C'])
3643 A D
3644 0 0 3
3645 1 4 7
3646 2 8 11
3647
3648 Drop a row by index
3649
3650 >>> df.drop([0, 1])
3651 A B C D
3652 2 8 9 10 11
3653
3654 Drop columns and/or rows of MultiIndex DataFrame
3655
3656 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
3657 ... ['speed', 'weight', 'length']],
3658 ... labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
3659 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
3660 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
3661 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
3662 ... [250, 150], [1.5, 0.8], [320, 250],
3663 ... [1, 0.8], [0.3,0.2]])
3664 >>> df
3665 big small
3666 lama speed 45.0 30.0
3667 weight 200.0 100.0
3668 length 1.5 1.0
3669 cow speed 30.0 20.0
3670 weight 250.0 150.0
3671 length 1.5 0.8
3672 falcon speed 320.0 250.0
3673 weight 1.0 0.8
3674 length 0.3 0.2
3675
3676 >>> df.drop(index='cow', columns='small')
3677 big
3678 lama speed 45.0
3679 weight 200.0
3680 length 1.5
3681 falcon speed 320.0
3682 weight 1.0
3683 length 0.3
3684
3685 >>> df.drop(index='length', level=1)
3686 big small
3687 lama speed 45.0 30.0
3688 weight 200.0 100.0
3689 cow speed 30.0 20.0
3690 weight 250.0 150.0
3691 falcon speed 320.0 250.0
3692 weight 1.0 0.8
3693 """
3694 return super(DataFrame, self).drop(labels=labels, axis=axis,
3695 index=index, columns=columns,
3696 level=level, inplace=inplace,
3697 errors=errors)
3698
3699 @rewrite_axis_style_signature('mapper', [('copy', True),
3700 ('inplace', False),
3701 ('level', None)])
3702 def rename(self, *args, **kwargs):
3703 """Alter axes labels.
3704
3705 Function / dict values must be unique (1-to-1). Labels not contained in
3706 a dict / Series will be left as-is. Extra labels listed don't throw an
3707 error.
3708
3709 See the :ref:`user guide <basics.rename>` for more.
3710
3711 Parameters
3712 ----------
3713 mapper, index, columns : dict-like or function, optional
            Dict-like or function transformations to apply to
3715 that axis' values. Use either ``mapper`` and ``axis`` to
3716 specify the axis to target with ``mapper``, or ``index`` and
3717 ``columns``.
3718 axis : int or str, optional
3719 Axis to target with ``mapper``. Can be either the axis name
3720 ('index', 'columns') or number (0, 1). The default is 'index'.
3721 copy : boolean, default True
3722 Also copy underlying data
3723 inplace : boolean, default False
            Whether to modify the DataFrame in place rather than returning a
            new one. If True, the value of ``copy`` is ignored.
3726 level : int or level name, default None
3727 In case of a MultiIndex, only rename labels in the specified
3728 level.
3729
3730 Returns
3731 -------
3732 renamed : DataFrame
3733
3734 See Also
3735 --------
3736 pandas.DataFrame.rename_axis
3737
3738 Examples
3739 --------
3740
3741 ``DataFrame.rename`` supports two calling conventions
3742
3743 * ``(index=index_mapper, columns=columns_mapper, ...)``
3744 * ``(mapper, axis={'index', 'columns'}, ...)``
3745
3746 We *highly* recommend using keyword arguments to clarify your
3747 intent.
3748
3749 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
3750 >>> df.rename(index=str, columns={"A": "a", "B": "c"})
3751 a c
3752 0 1 4
3753 1 2 5
3754 2 3 6
3755
3756 >>> df.rename(index=str, columns={"A": "a", "C": "c"})
3757 a B
3758 0 1 4
3759 1 2 5
3760 2 3 6
3761
3762 Using axis-style parameters
3763
3764 >>> df.rename(str.lower, axis='columns')
3765 a b
3766 0 1 4
3767 1 2 5
3768 2 3 6
3769
3770 >>> df.rename({1: 2, 2: 4}, axis='index')
3771 A B
3772 0 1 4
3773 2 2 5
3774 4 3 6
3775 """
3776 axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')
3777 kwargs.update(axes)
3778 # Pop these, since the values are in `kwargs` under different names
3779 kwargs.pop('axis', None)
3780 kwargs.pop('mapper', None)
3781 return super(DataFrame, self).rename(**kwargs)
3782
3783 @Substitution(**_shared_doc_kwargs)
3784 @Appender(NDFrame.fillna.__doc__)
3785 def fillna(self, value=None, method=None, axis=None, inplace=False,
3786 limit=None, downcast=None, **kwargs):
3787 return super(DataFrame,
3788 self).fillna(value=value, method=method, axis=axis,
3789 inplace=inplace, limit=limit,
3790 downcast=downcast, **kwargs)
3791
3792 @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
3793 def replace(self, to_replace=None, value=None, inplace=False, limit=None,
3794 regex=False, method='pad'):
3795 return super(DataFrame, self).replace(to_replace=to_replace,
3796 value=value, inplace=inplace,
3797 limit=limit, regex=regex,
3798 method=method)
3799
3800 @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
3801 def shift(self, periods=1, freq=None, axis=0):
3802 return super(DataFrame, self).shift(periods=periods, freq=freq,
3803 axis=axis)
3804
3805 def set_index(self, keys, drop=True, append=False, inplace=False,
3806 verify_integrity=False):
3807 """
3808 Set the DataFrame index (row labels) using one or more existing
3809 columns. By default yields a new object.
3810
3811 Parameters
3812 ----------
3813 keys : column label or list of column labels / arrays
3814 drop : boolean, default True
3815 Delete columns to be used as the new index
3816 append : boolean, default False
3817 Whether to append columns to existing index
3818 inplace : boolean, default False
3819 Modify the DataFrame in place (do not create a new object)
3820 verify_integrity : boolean, default False
3821 Check the new index for duplicates. Otherwise defer the check until
3822 necessary. Setting to False will improve the performance of this
3823 method
3824
3825 Examples
3826 --------
3827 >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
3828 ... 'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  sale  year
3831 0 1 55 2012
3832 1 4 40 2014
3833 2 7 84 2013
3834 3 10 31 2014
3835
3836 Set the index to become the 'month' column:
3837
3838 >>> df.set_index('month')
3839 sale year
3840 month
3841 1 55 2012
3842 4 40 2014
3843 7 84 2013
3844 10 31 2014
3845
3846 Create a multi-index using columns 'year' and 'month':
3847
3848 >>> df.set_index(['year', 'month'])
3849 sale
3850 year month
3851 2012 1 55
3852 2014 4 40
3853 2013 7 84
3854 2014 10 31
3855
3856 Create a multi-index using a set of values and a column:
3857
3858 >>> df.set_index([[1, 2, 3, 4], 'year'])
3859 month sale
3860 year
3861 1 2012 1 55
3862 2 2014 4 40
3863 3 2013 7 84
3864 4 2014 10 31
3865
3866 Returns
3867 -------
3868 dataframe : DataFrame
3869 """
3870 inplace = validate_bool_kwarg(inplace, 'inplace')
3871 if not isinstance(keys, list):
3872 keys = [keys]
3873
3874 if inplace:
3875 frame = self
3876 else:
3877 frame = self.copy()
3878
3879 arrays = []
3880 names = []
3881 if append:
3882 names = [x for x in self.index.names]
3883 if isinstance(self.index, MultiIndex):
3884 for i in range(self.index.nlevels):
3885 arrays.append(self.index._get_level_values(i))
3886 else:
3887 arrays.append(self.index)
3888
3889 to_remove = []
3890 for col in keys:
3891 if isinstance(col, MultiIndex):
3892 # append all but the last column so we don't have to modify
3893 # the end of this loop
3894 for n in range(col.nlevels - 1):
3895 arrays.append(col._get_level_values(n))
3896
3897 level = col._get_level_values(col.nlevels - 1)
3898 names.extend(col.names)
3899 elif isinstance(col, Series):
3900 level = col._values
3901 names.append(col.name)
3902 elif isinstance(col, Index):
3903 level = col
3904 names.append(col.name)
3905 elif isinstance(col, (list, np.ndarray, Index)):
3906 level = col
3907 names.append(None)
3908 else:
3909 level = frame[col]._values
3910 names.append(col)
3911 if drop:
3912 to_remove.append(col)
3913 arrays.append(level)
3914
3915 index = _ensure_index_from_sequences(arrays, names)
3916
3917 if verify_integrity and not index.is_unique:
3918 duplicates = index[index.duplicated()].unique()
3919 raise ValueError('Index has duplicate keys: {dup}'.format(
3920 dup=duplicates))
3921
3922 for c in to_remove:
3923 del frame[c]
3924
3925 # clear up memory usage
3926 index._cleanup()
3927
3928 frame.index = index
3929
3930 if not inplace:
3931 return frame
3932
3933 def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
3934 col_fill=''):
3935 """
3936 For DataFrame with multi-level index, return new DataFrame with
3937 labeling information in the columns under the index names, defaulting
3938 to 'level_0', 'level_1', etc. if any are None. For a standard index,
3939 the index name will be used (if set), otherwise a default 'index' or
3940 'level_0' (if 'index' is already taken) will be used.
3941
3942 Parameters
3943 ----------
3944 level : int, str, tuple, or list, default None
3945 Only remove the given levels from the index. Removes all levels by
3946 default
3947 drop : boolean, default False
3948 Do not try to insert index into dataframe columns. This resets
3949 the index to the default integer index.
3950 inplace : boolean, default False
3951 Modify the DataFrame in place (do not create a new object)
3952 col_level : int or str, default 0
3953 If the columns have multiple levels, determines which level the
3954 labels are inserted into. By default it is inserted into the first
3955 level.
3956 col_fill : object, default ''
3957 If the columns have multiple levels, determines how the other
3958 levels are named. If None then the index name is repeated.
3959
3960 Returns
3961 -------
        reset : DataFrame
3963
3964 Examples
3965 --------
3966 >>> df = pd.DataFrame([('bird', 389.0),
3967 ... ('bird', 24.0),
3968 ... ('mammal', 80.5),
3969 ... ('mammal', np.nan)],
3970 ... index=['falcon', 'parrot', 'lion', 'monkey'],
3971 ... columns=('class', 'max_speed'))
3972 >>> df
3973 class max_speed
3974 falcon bird 389.0
3975 parrot bird 24.0
3976 lion mammal 80.5
3977 monkey mammal NaN
3978
3979 When we reset the index, the old index is added as a column, and a
3980 new sequential index is used:
3981
3982 >>> df.reset_index()
3983 index class max_speed
3984 0 falcon bird 389.0
3985 1 parrot bird 24.0
3986 2 lion mammal 80.5
3987 3 monkey mammal NaN
3988
3989 We can use the `drop` parameter to avoid the old index being added as
3990 a column:
3991
3992 >>> df.reset_index(drop=True)
3993 class max_speed
3994 0 bird 389.0
3995 1 bird 24.0
3996 2 mammal 80.5
3997 3 mammal NaN
3998
3999 You can also use `reset_index` with `MultiIndex`.
4000
4001 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
4002 ... ('bird', 'parrot'),
4003 ... ('mammal', 'lion'),
4004 ... ('mammal', 'monkey')],
4005 ... names=['class', 'name'])
4006 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
4007 ... ('species', 'type')])
4008 >>> df = pd.DataFrame([(389.0, 'fly'),
4009 ... ( 24.0, 'fly'),
4010 ... ( 80.5, 'run'),
4011 ... (np.nan, 'jump')],
4012 ... index=index,
4013 ... columns=columns)
4014 >>> df
4015 speed species
4016 max type
4017 class name
4018 bird falcon 389.0 fly
4019 parrot 24.0 fly
4020 mammal lion 80.5 run
4021 monkey NaN jump
4022
4023 If the index has multiple levels, we can reset a subset of them:
4024
4025 >>> df.reset_index(level='class')
4026 class speed species
4027 max type
4028 name
4029 falcon bird 389.0 fly
4030 parrot bird 24.0 fly
4031 lion mammal 80.5 run
4032 monkey mammal NaN jump
4033
4034 If we are not dropping the index, by default, it is placed in the top
4035 level. We can place it in another level:
4036
4037 >>> df.reset_index(level='class', col_level=1)
4038 speed species
4039 class max type
4040 name
4041 falcon bird 389.0 fly
4042 parrot bird 24.0 fly
4043 lion mammal 80.5 run
4044 monkey mammal NaN jump
4045
4046 When the index is inserted under another level, we can specify under
4047 which one with the parameter `col_fill`:
4048
4049 >>> df.reset_index(level='class', col_level=1, col_fill='species')
4050 species speed species
4051 class max type
4052 name
4053 falcon bird 389.0 fly
4054 parrot bird 24.0 fly
4055 lion mammal 80.5 run
4056 monkey mammal NaN jump
4057
4058 If we specify a nonexistent level for `col_fill`, it is created:
4059
4060 >>> df.reset_index(level='class', col_level=1, col_fill='genus')
4061 genus speed species
4062 class max type
4063 name
4064 falcon bird 389.0 fly
4065 parrot bird 24.0 fly
4066 lion mammal 80.5 run
4067 monkey mammal NaN jump
4068 """
4069 inplace = validate_bool_kwarg(inplace, 'inplace')
4070 if inplace:
4071 new_obj = self
4072 else:
4073 new_obj = self.copy()
4074
4075 def _maybe_casted_values(index, labels=None):
4076 values = index._values
4077 if not isinstance(index, (PeriodIndex, DatetimeIndex)):
4078 if values.dtype == np.object_:
4079 values = lib.maybe_convert_objects(values)
4080
4081 # if we have the labels, extract the values with a mask
4082 if labels is not None:
4083 mask = labels == -1
4084
4085 # we can have situations where the whole mask is -1,
4086 # meaning there is nothing found in labels, so make all nan's
4087 if mask.all():
4088 values = np.empty(len(mask))
4089 values.fill(np.nan)
4090 else:
4091 values = values.take(labels)
4092 if mask.any():
4093 values, changed = maybe_upcast_putmask(
4094 values, mask, np.nan)
4095 return values
4096
4097 new_index = com._default_index(len(new_obj))
4098 if level is not None:
4099 if not isinstance(level, (tuple, list)):
4100 level = [level]
4101 level = [self.index._get_level_number(lev) for lev in level]
4102 if isinstance(self.index, MultiIndex):
4103 if len(level) < self.index.nlevels:
4104 new_index = self.index.droplevel(level)
4105
4106 if not drop:
4107 if isinstance(self.index, MultiIndex):
4108 names = [n if n is not None else ('level_%d' % i)
4109 for (i, n) in enumerate(self.index.names)]
4110 to_insert = lzip(self.index.levels, self.index.labels)
4111 else:
4112 default = 'index' if 'index' not in self else 'level_0'
4113 names = ([default] if self.index.name is None
4114 else [self.index.name])
4115 to_insert = ((self.index, None),)
4116
4117 multi_col = isinstance(self.columns, MultiIndex)
4118 for i, (lev, lab) in reversed(list(enumerate(to_insert))):
4119 if not (level is None or i in level):
4120 continue
4121 name = names[i]
4122 if multi_col:
4123 col_name = (list(name) if isinstance(name, tuple)
4124 else [name])
4125 if col_fill is None:
4126 if len(col_name) not in (1, self.columns.nlevels):
4127 raise ValueError("col_fill=None is incompatible "
4128 "with incomplete column name "
4129 "{}".format(name))
4130 col_fill = col_name[0]
4131
4132 lev_num = self.columns._get_level_number(col_level)
4133 name_lst = [col_fill] * lev_num + col_name
4134 missing = self.columns.nlevels - len(name_lst)
4135 name_lst += [col_fill] * missing
4136 name = tuple(name_lst)
4137 # to ndarray and maybe infer different dtype
4138 level_values = _maybe_casted_values(lev, lab)
4139 new_obj.insert(0, name, level_values)
4140
4141 new_obj.index = new_index
4142 if not inplace:
4143 return new_obj
4144
4145 # ----------------------------------------------------------------------
4146 # Reindex-based selection methods
4147
4148 @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
4149 def isna(self):
4150 return super(DataFrame, self).isna()
4151
4152 @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
4153 def isnull(self):
4154 return super(DataFrame, self).isnull()
4155
4156 @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
4157 def notna(self):
4158 return super(DataFrame, self).notna()
4159
4160 @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
4161 def notnull(self):
4162 return super(DataFrame, self).notnull()
4163
4164 def dropna(self, axis=0, how='any', thresh=None, subset=None,
4165 inplace=False):
4166 """
4167 Remove missing values.
4168
4169 See the :ref:`User Guide <missing_data>` for more on which values are
4170 considered missing, and how to work with missing data.
4171
4172 Parameters
4173 ----------
4174 axis : {0 or 'index', 1 or 'columns'}, default 0
4175 Determine if rows or columns which contain missing values are
4176 removed.
4177
4178 * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.

            .. deprecated:: 0.23.0
               Pass tuple or list to drop on multiple axes.
4183 how : {'any', 'all'}, default 'any'
4184 Determine if row or column is removed from DataFrame, when we have
4185 at least one NA or all NA.
4186
4187 * 'any' : If any NA values are present, drop that row or column.
4188 * 'all' : If all values are NA, drop that row or column.
4189 thresh : int, optional
4190 Require that many non-NA values.
4191 subset : array-like, optional
4192 Labels along other axis to consider, e.g. if you are dropping rows
4193 these would be a list of columns to include.
4194 inplace : bool, default False
4195 If True, do operation inplace and return None.
4196
4197 Returns
4198 -------
4199 DataFrame
4200 DataFrame with NA entries dropped from it.
4201
4202 See Also
4203 --------
        DataFrame.isna : Indicate missing values.
4205 DataFrame.notna : Indicate existing (non-missing) values.
4206 DataFrame.fillna : Replace missing values.
4207 Series.dropna : Drop missing values.
4208 Index.dropna : Drop missing indices.
4209
4210 Examples
4211 --------
4212 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
4213 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
4214 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
4215 ... pd.NaT]})
4216 >>> df
4217 name toy born
4218 0 Alfred NaN NaT
4219 1 Batman Batmobile 1940-04-25
4220 2 Catwoman Bullwhip NaT
4221
4222 Drop the rows where at least one element is missing.
4223
4224 >>> df.dropna()
4225 name toy born
4226 1 Batman Batmobile 1940-04-25
4227
4228 Drop the columns where at least one element is missing.
4229
4230 >>> df.dropna(axis='columns')
4231 name
4232 0 Alfred
4233 1 Batman
4234 2 Catwoman
4235
4236 Drop the rows where all elements are missing.
4237
4238 >>> df.dropna(how='all')
4239 name toy born
4240 0 Alfred NaN NaT
4241 1 Batman Batmobile 1940-04-25
4242 2 Catwoman Bullwhip NaT
4243
4244 Keep only the rows with at least 2 non-NA values.
4245
4246 >>> df.dropna(thresh=2)
4247 name toy born
4248 1 Batman Batmobile 1940-04-25
4249 2 Catwoman Bullwhip NaT
4250
4251 Define in which columns to look for missing values.
4252
4253 >>> df.dropna(subset=['name', 'born'])
4254 name toy born
4255 1 Batman Batmobile 1940-04-25
4256
4257 Keep the DataFrame with valid entries in the same variable.
4258
4259 >>> df.dropna(inplace=True)
4260 >>> df
4261 name toy born
4262 1 Batman Batmobile 1940-04-25
4263 """
4264 inplace = validate_bool_kwarg(inplace, 'inplace')
4265 if isinstance(axis, (tuple, list)):
4266 # GH20987
4267 msg = ("supplying multiple axes to axis is deprecated and "
4268 "will be removed in a future version.")
4269 warnings.warn(msg, FutureWarning, stacklevel=2)
4270
4271 result = self
4272 for ax in axis:
4273 result = result.dropna(how=how, thresh=thresh, subset=subset,
4274 axis=ax)
4275 else:
4276 axis = self._get_axis_number(axis)
4277 agg_axis = 1 - axis
4278
4279 agg_obj = self
4280 if subset is not None:
4281 ax = self._get_axis(agg_axis)
4282 indices = ax.get_indexer_for(subset)
4283 check = indices == -1
4284 if check.any():
4285 raise KeyError(list(np.compress(check, subset)))
4286 agg_obj = self.take(indices, axis=agg_axis)
4287
4288 count = agg_obj.count(axis=agg_axis)
4289
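            # Non-NA count along the opposite axis; the masks below keep
            # labels whose count satisfies `thresh` or `how`.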
4290 if thresh is not None:
4291 mask = count >= thresh
4292 elif how == 'any':
4293 mask = count == len(agg_obj._get_axis(agg_axis))
4294 elif how == 'all':
4295 mask = count > 0
4296 else:
4297 if how is not None:
4298 raise ValueError('invalid how option: {h}'.format(h=how))
4299 else:
4300 raise TypeError('must specify how or thresh')
4301
4302 result = self._take(mask.nonzero()[0], axis=axis)
4303
4304 if inplace:
4305 self._update_inplace(result)
4306 else:
4307 return result
4308
4309 def drop_duplicates(self, subset=None, keep='first', inplace=False):
4310 """
4311 Return DataFrame with duplicate rows removed, optionally only
4312 considering certain columns
4313
4314 Parameters
4315 ----------
4316 subset : column label or sequence of labels, optional
4317 Only consider certain columns for identifying duplicates, by
4318 default use all of the columns
4319 keep : {'first', 'last', False}, default 'first'
4320 - ``first`` : Drop duplicates except for the first occurrence.
4321 - ``last`` : Drop duplicates except for the last occurrence.
4322 - False : Drop all duplicates.
4323 inplace : boolean, default False
4324 Whether to drop duplicates in place or to return a copy
4325
4326 Returns
4327 -------
4328 deduplicated : DataFrame
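
        Examples
        --------
        A small sketch (data invented for illustration):

        >>> df = pd.DataFrame({'a': [1, 1, 2], 'b': ['x', 'x', 'y']})
        >>> df.drop_duplicates()
            a  b
        0   1  x
        2   2  y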
4329 """
4330 inplace = validate_bool_kwarg(inplace, 'inplace')
4331 duplicated = self.duplicated(subset, keep=keep)
4332
4333 if inplace:
            inds, = (~duplicated).nonzero()
4335 new_data = self._data.take(inds)
4336 self._update_inplace(new_data)
4337 else:
            return self[~duplicated]
4339
4340 def duplicated(self, subset=None, keep='first'):
4341 """
4342 Return boolean Series denoting duplicate rows, optionally only
4343 considering certain columns
4344
4345 Parameters
4346 ----------
4347 subset : column label or sequence of labels, optional
4348 Only consider certain columns for identifying duplicates, by
4349 default use all of the columns
4350 keep : {'first', 'last', False}, default 'first'
4351 - ``first`` : Mark duplicates as ``True`` except for the
4352 first occurrence.
4353 - ``last`` : Mark duplicates as ``True`` except for the
4354 last occurrence.
4355 - False : Mark all duplicates as ``True``.
4356
4357 Returns
4358 -------
4359 duplicated : Series
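
        Examples
        --------
        A small sketch (data invented for illustration):

        >>> df = pd.DataFrame({'a': [1, 1, 2]})
        >>> df.duplicated()
        0    False
        1     True
        2    False
        dtype: bool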
4360 """
4361 from pandas.core.sorting import get_group_index
4362 from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
4363
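        # Factorize each column into integer labels, combine the per-column
        # labels into a single group id, then flag repeated ids.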
4364 def f(vals):
4365 labels, shape = algorithms.factorize(
4366 vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
4367 return labels.astype('i8', copy=False), len(shape)
4368
4369 if subset is None:
4370 subset = self.columns
4371 elif (not np.iterable(subset) or
4372 isinstance(subset, compat.string_types) or
4373 isinstance(subset, tuple) and subset in self.columns):
4374 subset = subset,
4375
4376 # Verify all columns in subset exist in the queried dataframe
4377 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
4378 # key that doesn't exist.
4379 diff = Index(subset).difference(self.columns)
4380 if not diff.empty:
4381 raise KeyError(diff)
4382
4383 vals = (col.values for name, col in self.iteritems()
4384 if name in subset)
4385 labels, shape = map(list, zip(*map(f, vals)))
4386
4387 ids = get_group_index(labels, shape, sort=False, xnull=False)
4388 return Series(duplicated_int64(ids, keep), index=self.index)
4389
4390 # ----------------------------------------------------------------------
4391 # Sorting
4392
4393 @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs)
4394 def sort_values(self, by, axis=0, ascending=True, inplace=False,
4395 kind='quicksort', na_position='last'):
4396 inplace = validate_bool_kwarg(inplace, 'inplace')
4397 axis = self._get_axis_number(axis)
4398 stacklevel = 2 # Number of stack levels from df.sort_values
4399
4400 if not isinstance(by, list):
4401 by = [by]
4402 if is_sequence(ascending) and len(by) != len(ascending):
4403 raise ValueError('Length of ascending (%d) != length of by (%d)' %
4404 (len(ascending), len(by)))
4405 if len(by) > 1:
4406 from pandas.core.sorting import lexsort_indexer
4407
4408 keys = []
4409 for x in by:
4410 k = self._get_label_or_level_values(x, axis=axis,
4411 stacklevel=stacklevel)
4412 keys.append(k)
4413 indexer = lexsort_indexer(keys, orders=ascending,
4414 na_position=na_position)
4415 indexer = _ensure_platform_int(indexer)
4416 else:
4417 from pandas.core.sorting import nargsort
4418
4419 by = by[0]
4420 k = self._get_label_or_level_values(by, axis=axis,
4421 stacklevel=stacklevel)
4422
4423 if isinstance(ascending, (tuple, list)):
4424 ascending = ascending[0]
4425
4426 indexer = nargsort(k, kind=kind, ascending=ascending,
4427 na_position=na_position)
4428
4429 new_data = self._data.take(indexer,
4430 axis=self._get_block_manager_axis(axis),
4431 verify=False)
4432
4433 if inplace:
4434 return self._update_inplace(new_data)
4435 else:
4436 return self._constructor(new_data).__finalize__(self)
4437
4438 @Appender(_shared_docs['sort_index'] % _shared_doc_kwargs)
4439 def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
4440 kind='quicksort', na_position='last', sort_remaining=True,
4441 by=None):
4442
4443 # TODO: this can be combined with Series.sort_index impl as
4444 # almost identical
4445
4446 inplace = validate_bool_kwarg(inplace, 'inplace')
4447 # 10726
4448 if by is not None:
4449 warnings.warn("by argument to sort_index is deprecated, "
4450 "please use .sort_values(by=...)",
4451 FutureWarning, stacklevel=2)
4452 if level is not None:
4453 raise ValueError("unable to simultaneously sort by and level")
4454 return self.sort_values(by, axis=axis, ascending=ascending,
4455 inplace=inplace)
4456
4457 axis = self._get_axis_number(axis)
4458 labels = self._get_axis(axis)
4459
4460 # make sure that the axis is lexsorted to start
4461 # if not we need to reconstruct to get the correct indexer
4462 labels = labels._sort_levels_monotonic()
4463 if level is not None:
4464
4465 new_axis, indexer = labels.sortlevel(level, ascending=ascending,
4466 sort_remaining=sort_remaining)
4467
4468 elif isinstance(labels, MultiIndex):
4469 from pandas.core.sorting import lexsort_indexer
4470
4471 indexer = lexsort_indexer(labels._get_labels_for_sorting(),
4472 orders=ascending,
4473 na_position=na_position)
4474 else:
4475 from pandas.core.sorting import nargsort
4476
4477 # Check monotonic-ness before sort an index
4478 # GH11080
4479 if ((ascending and labels.is_monotonic_increasing) or
4480 (not ascending and labels.is_monotonic_decreasing)):
4481 if inplace:
4482 return
4483 else:
4484 return self.copy()
4485
4486 indexer = nargsort(labels, kind=kind, ascending=ascending,
4487 na_position=na_position)
4488
4489 baxis = self._get_block_manager_axis(axis)
4490 new_data = self._data.take(indexer,
4491 axis=baxis,
4492 verify=False)
4493
4494 # reconstruct axis if needed
4495 new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
4496
4497 if inplace:
4498 return self._update_inplace(new_data)
4499 else:
4500 return self._constructor(new_data).__finalize__(self)
4501
4502 def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
4503 sort_remaining=True):
4504 """Sort multilevel index by chosen axis and primary level. Data will be
4505 lexicographically sorted by the chosen level followed by the other
4506 levels (in order).
4507
4508 .. deprecated:: 0.20.0
4509 Use :meth:`DataFrame.sort_index`
4510
4511
4512 Parameters
4513 ----------
4514 level : int
4515 axis : {0 or 'index', 1 or 'columns'}, default 0
4516 ascending : boolean, default True
4517 inplace : boolean, default False
4518 Sort the DataFrame without creating a new instance
4519 sort_remaining : boolean, default True
4520 Sort by the other levels too.
4521
4522 Returns
4523 -------
4524 sorted : DataFrame
4525
4526 See Also
4527 --------
4528 DataFrame.sort_index(level=...)
4529
4530 """
4531 warnings.warn("sortlevel is deprecated, use sort_index(level= ...)",
4532 FutureWarning, stacklevel=2)
4533 return self.sort_index(level=level, axis=axis, ascending=ascending,
4534 inplace=inplace, sort_remaining=sort_remaining)
4535
4536 def nlargest(self, n, columns, keep='first'):
4537 """
4538 Return the first `n` rows ordered by `columns` in descending order.
4539
4540 Return the first `n` rows with the largest values in `columns`, in
4541 descending order. The columns that are not specified are returned as
4542 well, but not used for ordering.
4543
4544 This method is equivalent to
4545 ``df.sort_values(columns, ascending=False).head(n)``, but more
4546 performant.
4547
4548 Parameters
4549 ----------
4550 n : int
4551 Number of rows to return.
4552 columns : label or list of labels
4553 Column label(s) to order by.
4554 keep : {'first', 'last'}, default 'first'
4555 Where there are duplicate values:
4556
4557 - `first` : prioritize the first occurrence(s)
4558 - `last` : prioritize the last occurrence(s)
4559
4560 Returns
4561 -------
4562 DataFrame
4563 The first `n` rows ordered by the given columns in descending
4564 order.
4565
4566 See Also
4567 --------
4568 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
4569 ascending order.
4570 DataFrame.sort_values : Sort DataFrame by the values
4571 DataFrame.head : Return the first `n` rows without re-ordering.
4572
4573 Notes
4574 -----
4575 This function cannot be used with all column types. For example, when
4576 specifying columns with `object` or `category` dtypes, ``TypeError`` is
4577 raised.
4578
4579 Examples
4580 --------
4581 >>> df = pd.DataFrame({'a': [1, 10, 8, 10, -1],
4582 ... 'b': list('abdce'),
4583 ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
4584 >>> df
4585 a b c
4586 0 1 a 1.0
4587 1 10 b 2.0
4588 2 8 d NaN
4589 3 10 c 3.0
4590 4 -1 e 4.0
4591
4592 In the following example, we will use ``nlargest`` to select the three
4593 rows having the largest values in column "a".
4594
4595 >>> df.nlargest(3, 'a')
4596 a b c
4597 1 10 b 2.0
4598 3 10 c 3.0
4599 2 8 d NaN
4600
4601 When using ``keep='last'``, ties are resolved in reverse order:
4602
4603 >>> df.nlargest(3, 'a', keep='last')
4604 a b c
4605 3 10 c 3.0
4606 1 10 b 2.0
4607 2 8 d NaN
4608
4609 To order by the largest values in column "a" and then "c", we can
4610 specify multiple columns like in the next example.
4611
4612 >>> df.nlargest(3, ['a', 'c'])
4613 a b c
4614 3 10 c 3.0
4615 1 10 b 2.0
4616 2 8 d NaN
4617
4618 Attempting to use ``nlargest`` on non-numeric dtypes will raise a
4619 ``TypeError``:
4620
4621 >>> df.nlargest(3, 'b')
4622 Traceback (most recent call last):
4623 TypeError: Column 'b' has dtype object, cannot use method 'nlargest'
4624 """
4625 return algorithms.SelectNFrame(self,
4626 n=n,
4627 keep=keep,
4628 columns=columns).nlargest()
4629
4630 def nsmallest(self, n, columns, keep='first'):
4631 """Get the rows of a DataFrame sorted by the `n` smallest
4632 values of `columns`.
4633
4634 Parameters
4635 ----------
4636 n : int
4637 Number of items to retrieve
4638 columns : list or str
4639 Column name or names to order by
4640 keep : {'first', 'last'}, default 'first'
4641 Where there are duplicate values:
4642 - ``first`` : take the first occurrence.
4643 - ``last`` : take the last occurrence.
4644
4645 Returns
4646 -------
4647 DataFrame
4648
4649 Examples
4650 --------
4651 >>> df = pd.DataFrame({'a': [1, 10, 8, 11, -1],
4652 ... 'b': list('abdce'),
4653 ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
4654 >>> df.nsmallest(3, 'a')
            a  b    c
        4  -1  e  4.0
        0   1  a  1.0
        2   8  d  NaN
4659 """
4660 return algorithms.SelectNFrame(self,
4661 n=n,
4662 keep=keep,
4663 columns=columns).nsmallest()
4664
4665 def swaplevel(self, i=-2, j=-1, axis=0):
4666 """
4667 Swap levels i and j in a MultiIndex on a particular axis
4668
4669 Parameters
4670 ----------
4671 i, j : int, string (can be mixed)
4672 Level of index to be swapped. Can pass level name as string.
4673
4674 Returns
4675 -------
4676 swapped : type of caller (new object)
4677
4678 .. versionchanged:: 0.18.1
4679
4680 The indexes ``i`` and ``j`` are now optional, and default to
4681 the two innermost levels of the index.
4682
4683 """
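        Examples
        --------
        A minimal sketch (index values invented for illustration):

        >>> idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)])
        >>> df = pd.DataFrame({'x': [10, 20]}, index=idx)
        >>> df.swaplevel(0, 1).index.tolist()
        [(1, 'a'), (2, 'b')]
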
4684 result = self.copy()
4685
4686 axis = self._get_axis_number(axis)
4687 if axis == 0:
4688 result.index = result.index.swaplevel(i, j)
4689 else:
4690 result.columns = result.columns.swaplevel(i, j)
4691 return result
4692
4693 def reorder_levels(self, order, axis=0):
4694 """
4695 Rearrange index levels using input order.
        May not drop or duplicate levels.
4697
4698 Parameters
4699 ----------
4700 order : list of int or list of str
4701 List representing new level order. Reference level by number
4702 (position) or by key (label).
4703 axis : int
4704 Where to reorder levels.
4705
4706 Returns
4707 -------
4708 type of caller (new object)
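
        Examples
        --------
        A minimal sketch (labels invented for illustration):

        >>> idx = pd.MultiIndex.from_tuples([('a', 1), ('b', 2)],
        ...                                 names=['outer', 'inner'])
        >>> df = pd.DataFrame({'x': [10, 20]}, index=idx)
        >>> df.reorder_levels(['inner', 'outer']).index.names
        FrozenList(['inner', 'outer'])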
4709 """
4710 axis = self._get_axis_number(axis)
4711 if not isinstance(self._get_axis(axis),
4712 MultiIndex): # pragma: no cover
4713 raise TypeError('Can only reorder levels on a hierarchical axis.')
4714
4715 result = self.copy()
4716
4717 if axis == 0:
4718 result.index = result.index.reorder_levels(order)
4719 else:
4720 result.columns = result.columns.reorder_levels(order)
4721 return result
4722
4723 # ----------------------------------------------------------------------
4724 # Arithmetic / combination related
4725
4726 def _combine_frame(self, other, func, fill_value=None, level=None):
4727 this, other = self.align(other, join='outer', level=level, copy=False)
4728 new_index, new_columns = this.index, this.columns
4729
4730 def _arith_op(left, right):
4731 # for the mixed_type case where we iterate over columns,
4732 # _arith_op(left, right) is equivalent to
4733 # left._binop(right, func, fill_value=fill_value)
4734 left, right = ops.fill_binop(left, right, fill_value)
4735 return func(left, right)
4736
4737 if this._is_mixed_type or other._is_mixed_type:
4738 # iterate over columns
4739 if this.columns.is_unique:
4740 # unique columns
4741 result = {col: _arith_op(this[col], other[col])
4742 for col in this}
4743 result = self._constructor(result, index=new_index,
4744 columns=new_columns, copy=False)
4745 else:
4746 # non-unique columns
4747 result = {i: _arith_op(this.iloc[:, i], other.iloc[:, i])
4748 for i, col in enumerate(this.columns)}
4749 result = self._constructor(result, index=new_index, copy=False)
4750 result.columns = new_columns
4751 return result
4752
4753 else:
4754 result = _arith_op(this.values, other.values)
4755
4756 return self._constructor(result, index=new_index, columns=new_columns,
4757 copy=False)
4758
4759 def _combine_match_index(self, other, func, level=None):
4760 left, right = self.align(other, join='outer', axis=0, level=level,
4761 copy=False)
4762 new_data = func(left.values.T, right.values).T
4763 return self._constructor(new_data,
4764 index=left.index, columns=self.columns,
4765 copy=False)
4766
4767 def _combine_match_columns(self, other, func, level=None, try_cast=True):
4768 left, right = self.align(other, join='outer', axis=1, level=level,
4769 copy=False)
4770
4771 new_data = left._data.eval(func=func, other=right,
4772 axes=[left.columns, self.index],
4773 try_cast=try_cast)
4774 return self._constructor(new_data)
4775
4776 def _combine_const(self, other, func, errors='raise', try_cast=True):
4777 new_data = self._data.eval(func=func, other=other,
4778 errors=errors,
4779 try_cast=try_cast)
4780 return self._constructor(new_data)
4781
4782 def _compare_frame(self, other, func, str_rep):
4783 # compare_frame assumes self._indexed_same(other)
4784
4785 import pandas.core.computation.expressions as expressions
4786 # unique
4787 if self.columns.is_unique:
4788
4789 def _compare(a, b):
4790 return {col: func(a[col], b[col]) for col in a.columns}
4791
4792 new_data = expressions.evaluate(_compare, str_rep, self, other)
4793 return self._constructor(data=new_data, index=self.index,
4794 columns=self.columns, copy=False)
4795 # non-unique
4796 else:
4797
4798 def _compare(a, b):
4799 return {i: func(a.iloc[:, i], b.iloc[:, i])
4800 for i, col in enumerate(a.columns)}
4801
4802 new_data = expressions.evaluate(_compare, str_rep, self, other)
4803 result = self._constructor(data=new_data, index=self.index,
4804 copy=False)
4805 result.columns = self.columns
4806 return result
4807
4808 def combine(self, other, func, fill_value=None, overwrite=True):
4809 """
        Add two DataFrame objects and do not propagate NaN values, so if for
        a given (index, column) location one frame is missing a value, it
        will default to the other frame's value (which might be NaN as well).
4813
4814 Parameters
4815 ----------
4816 other : DataFrame
4817 func : function
            Function that takes two Series as inputs and returns a Series or
            a scalar.
4820 fill_value : scalar value
4821 overwrite : boolean, default True
4822 If True then overwrite values for common keys in the calling frame
4823
4824 Returns
4825 -------
4826 result : DataFrame
4827
4828 Examples
4829 --------
        >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
        >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
4832 >>> df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2)
4833 A B
4834 0 0 3
4835 1 0 3
4836
4837 See Also
4838 --------
4839 DataFrame.combine_first : Combine two DataFrame objects and default to
4840 non-null values in frame calling the method
4841 """
4842 other_idxlen = len(other.index) # save for compare
4843
4844 this, other = self.align(other, copy=False)
4845 new_index = this.index
4846
4847 if other.empty and len(new_index) == len(self.index):
4848 return self.copy()
4849
4850 if self.empty and len(other) == other_idxlen:
4851 return other.copy()
4852
4853 # sorts if possible
4854 new_columns = this.columns.union(other.columns)
4855 do_fill = fill_value is not None
4856
4857 result = {}
4858 for col in new_columns:
4859 series = this[col]
4860 otherSeries = other[col]
4861
4862 this_dtype = series.dtype
4863 other_dtype = otherSeries.dtype
4864
4865 this_mask = isna(series)
4866 other_mask = isna(otherSeries)
4867
            # don't overwrite columns unnecessarily
4869 # DO propagate if this column is not in the intersection
4870 if not overwrite and other_mask.all():
4871 result[col] = this[col].copy()
4872 continue
4873
4874 if do_fill:
4875 series = series.copy()
4876 otherSeries = otherSeries.copy()
4877 series[this_mask] = fill_value
4878 otherSeries[other_mask] = fill_value
4879
4880 # if we have different dtypes, possibly promote
4881 new_dtype = this_dtype
4882 if not is_dtype_equal(this_dtype, other_dtype):
4883 new_dtype = find_common_type([this_dtype, other_dtype])
4884 if not is_dtype_equal(this_dtype, new_dtype):
4885 series = series.astype(new_dtype)
4886 if not is_dtype_equal(other_dtype, new_dtype):
4887 otherSeries = otherSeries.astype(new_dtype)
4888
4889 # see if we need to be represented as i8 (datetimelike)
4890 # try to keep us at this dtype
4891 needs_i8_conversion_i = needs_i8_conversion(new_dtype)
4892 if needs_i8_conversion_i:
4893 arr = func(series, otherSeries, True)
4894 else:
4895 arr = func(series, otherSeries)
4896
4897 arr = maybe_downcast_to_dtype(arr, this_dtype)
4898
4899 result[col] = arr
4900
4901 # convert_objects just in case
4902 return self._constructor(result, index=new_index,
4903 columns=new_columns)._convert(datetime=True,
4904 copy=False)
4905
4906 def combine_first(self, other):
4907 """
        Combine two DataFrame objects and default to non-null values in the
        frame calling the method. The resulting index and columns will be the
        union of the respective indexes and columns.
4911
4912 Parameters
4913 ----------
4914 other : DataFrame
4915
4916 Returns
4917 -------
4918 combined : DataFrame
4919
4920 Examples
4921 --------
4922 df1's values prioritized, use values from df2 to fill holes:
4923
4924 >>> df1 = pd.DataFrame([[1, np.nan]])
4925 >>> df2 = pd.DataFrame([[3, 4]])
4926 >>> df1.combine_first(df2)
4927 0 1
4928 0 1 4.0
4929
4930 See Also
4931 --------
4932 DataFrame.combine : Perform series-wise operation on two DataFrames
4933 using a given function
4934 """
4935 import pandas.core.computation.expressions as expressions
4936
4937 def combiner(x, y, needs_i8_conversion=False):
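            # x wins wherever it is non-NA; y fills the holes. Datetimelike
            # values are viewed as int64 so the same mask logic applies.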
4938 x_values = x.values if hasattr(x, 'values') else x
4939 y_values = y.values if hasattr(y, 'values') else y
4940 if needs_i8_conversion:
4941 mask = isna(x)
4942 x_values = x_values.view('i8')
4943 y_values = y_values.view('i8')
4944 else:
4945 mask = isna(x_values)
4946
4947 return expressions.where(mask, y_values, x_values)
4948
4949 return self.combine(other, combiner, overwrite=False)
4950
4951 def update(self, other, join='left', overwrite=True, filter_func=None,
4952 raise_conflict=False):
4953 """
4954 Modify in place using non-NA values from another DataFrame.
4955
4956 Aligns on indices. There is no return value.
4957
4958 Parameters
4959 ----------
4960 other : DataFrame, or object coercible into a DataFrame
4961 Should have at least one matching index/column label
4962 with the original DataFrame. If a Series is passed,
4963 its name attribute must be set, and that will be
4964 used as the column name to align with the original DataFrame.
4965 join : {'left'}, default 'left'
4966 Only left join is implemented, keeping the index and columns of the
4967 original object.
4968 overwrite : bool, default True
4969 How to handle non-NA values for overlapping keys:
4970
4971 * True: overwrite original DataFrame's values
4972 with values from `other`.
4973 * False: only update values that are NA in
4974 the original DataFrame.
4975
4976 filter_func : callable(1d-array) -> boolean 1d-array, optional
4977 Can choose to replace values other than NA. Return True for values
4978 that should be updated.
4979 raise_conflict : bool, default False
4980 If True, will raise a ValueError if the DataFrame and `other`
4981 both contain non-NA data in the same place.
4982
4983 Raises
4984 ------
4985 ValueError
4986 When `raise_conflict` is True and there's overlapping non-NA data.
4987
4988 See Also
4989 --------
4990 dict.update : Similar method for dictionaries.
4991 DataFrame.merge : For column(s)-on-columns(s) operations.
4992
4993 Examples
4994 --------
4995 >>> df = pd.DataFrame({'A': [1, 2, 3],
4996 ... 'B': [400, 500, 600]})
4997 >>> new_df = pd.DataFrame({'B': [4, 5, 6],
4998 ... 'C': [7, 8, 9]})
4999 >>> df.update(new_df)
5000 >>> df
5001 A B
5002 0 1 4
5003 1 2 5
5004 2 3 6
5005
5006 The DataFrame's length does not increase as a result of the update,
5007 only values at matching index/column labels are updated.
5008
5009 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5010 ... 'B': ['x', 'y', 'z']})
5011 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
5012 >>> df.update(new_df)
5013 >>> df
5014 A B
5015 0 a d
5016 1 b e
5017 2 c f
5018
        For Series, its name attribute must be set.
5020
5021 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5022 ... 'B': ['x', 'y', 'z']})
5023 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
5024 >>> df.update(new_column)
5025 >>> df
5026 A B
5027 0 a d
5028 1 b y
5029 2 c e
5030 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5031 ... 'B': ['x', 'y', 'z']})
5032 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
5033 >>> df.update(new_df)
5034 >>> df
5035 A B
5036 0 a x
5037 1 b d
5038 2 c e
5039
5040 If `other` contains NaNs the corresponding values are not updated
5041 in the original dataframe.
5042
5043 >>> df = pd.DataFrame({'A': [1, 2, 3],
5044 ... 'B': [400, 500, 600]})
5045 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
5046 >>> df.update(new_df)
5047 >>> df
5048 A B
5049 0 1 4.0
5050 1 2 500.0
5051 2 3 6.0
5052 """
5053 import pandas.core.computation.expressions as expressions
5054 # TODO: Support other joins
5055 if join != 'left': # pragma: no cover
5056 raise NotImplementedError("Only left join is supported")
5057
5058 if not isinstance(other, DataFrame):
5059 other = DataFrame(other)
5060
5061 other = other.reindex_like(self)
5062
5063 for col in self.columns:
5064 this = self[col].values
5065 that = other[col].values
5066 if filter_func is not None:
5067 with np.errstate(all='ignore'):
5068 mask = ~filter_func(this) | isna(that)
5069 else:
5070 if raise_conflict:
5071 mask_this = notna(that)
5072 mask_that = notna(this)
                    if (mask_this & mask_that).any():
5074 raise ValueError("Data overlaps.")
5075
5076 if overwrite:
5077 mask = isna(that)
5078 else:
5079 mask = notna(this)
5080
            # don't overwrite columns unnecessarily
5082 if mask.all():
5083 continue
5084
5085 self[col] = expressions.where(mask, this, that)
5086
5087 # ----------------------------------------------------------------------
5088 # Data reshaping
5089
5090 def pivot(self, index=None, columns=None, values=None):
5091 """
5092 Return reshaped DataFrame organized by given index / column values.
5093
5094 Reshape data (produce a "pivot" table) based on column values. Uses
5095 unique values from specified `index` / `columns` to form axes of the
5096 resulting DataFrame. This function does not support data
5097 aggregation, multiple values will result in a MultiIndex in the
5098 columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
5099
5100 Parameters
5101 ----------
5102 index : string or object, optional
5103 Column to use to make new frame's index. If None, uses
5104 existing index.
5105 columns : string or object
5106 Column to use to make new frame's columns.
5107 values : string, object or a list of the previous, optional
5108 Column(s) to use for populating new frame's values. If not
5109 specified, all remaining columns will be used and the result will
5110 have hierarchically indexed columns.
5111
            .. versionchanged:: 0.23.0
               Also accept list of column names.
5114
5115 Returns
5116 -------
5117 DataFrame
5118 Returns reshaped DataFrame.
5119
5120 Raises
5121 ------
        ValueError
            When there are any `index`, `columns` combinations with multiple
            values. Use `DataFrame.pivot_table` when you need to aggregate.
5125
5126 See Also
5127 --------
5128 DataFrame.pivot_table : generalization of pivot that can handle
5129 duplicate values for one index/column pair.
5130 DataFrame.unstack : pivot based on the index values instead of a
5131 column.
5132
5133 Notes
5134 -----
5135 For finer-tuned control, see hierarchical indexing documentation along
5136 with the related stack/unstack methods.
5137
5138 Examples
5139 --------
5140 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
5141 ... 'two'],
5142 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
5143 ... 'baz': [1, 2, 3, 4, 5, 6],
5144 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
5145 >>> df
5146 foo bar baz zoo
5147 0 one A 1 x
5148 1 one B 2 y
5149 2 one C 3 z
5150 3 two A 4 q
5151 4 two B 5 w
5152 5 two C 6 t
5153
5154 >>> df.pivot(index='foo', columns='bar', values='baz')
5155 bar A B C
5156 foo
5157 one 1 2 3
5158 two 4 5 6
5159
5160 >>> df.pivot(index='foo', columns='bar')['baz']
5161 bar A B C
5162 foo
5163 one 1 2 3
5164 two 4 5 6
5165
5166 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
5167 baz zoo
5168 bar A B C A B C
5169 foo
5170 one 1 2 3 x y z
5171 two 4 5 6 q w t
5172
5173 A ValueError is raised if there are any duplicates.
5174
5175 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
5176 ... "bar": ['A', 'A', 'B', 'C'],
5177 ... "baz": [1, 2, 3, 4]})
5178 >>> df
5179 foo bar baz
5180 0 one A 1
5181 1 one A 2
5182 2 two B 3
5183 3 two C 4
5184
5185 Notice that the first two rows are the same for our `index`
5186 and `columns` arguments.
5187
5188 >>> df.pivot(index='foo', columns='bar', values='baz')
5189 Traceback (most recent call last):
5190 ...
5191 ValueError: Index contains duplicate entries, cannot reshape
5192 """
5193 from pandas.core.reshape.reshape import pivot
5194 return pivot(self, index=index, columns=columns, values=values)
5195
5196 _shared_docs['pivot_table'] = """
5197 Create a spreadsheet-style pivot table as a DataFrame. The levels in
5198 the pivot table will be stored in MultiIndex objects (hierarchical
5199 indexes) on the index and columns of the result DataFrame
5200
5201 Parameters
5202 ----------%s
5203 values : column to aggregate, optional
5204 index : column, Grouper, array, or list of the previous
5205 If an array is passed, it must be the same length as the data. The
5206 list can contain any of the other types (except list).
5207 Keys to group by on the pivot table index. If an array is passed,
        it is used in the same manner as column values.
5209 columns : column, Grouper, array, or list of the previous
5210 If an array is passed, it must be the same length as the data. The
5211 list can contain any of the other types (except list).
5212 Keys to group by on the pivot table column. If an array is passed,
        it is used in the same manner as column values.
5214 aggfunc : function, list of functions, dict, default numpy.mean
5215 If list of functions passed, the resulting pivot table will have
5216 hierarchical columns whose top level are the function names
5217 (inferred from the function objects themselves)
5218 If dict is passed, the key is column to aggregate and value
5219 is function or list of functions
5220 fill_value : scalar, default None
5221 Value to replace missing values with
5222 margins : boolean, default False
5223 Add all row / columns (e.g. for subtotal / grand totals)
5224 dropna : boolean, default True
5225 Do not include columns whose entries are all NaN
5226 margins_name : string, default 'All'
5227 Name of the row / column that will contain the totals
5228 when margins is True.
5229
5230 Examples
5231 --------
5232 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
5233 ... "bar", "bar", "bar", "bar"],
5234 ... "B": ["one", "one", "one", "two", "two",
5235 ... "one", "one", "two", "two"],
5236 ... "C": ["small", "large", "large", "small",
5237 ... "small", "large", "small", "small",
5238 ... "large"],
5239 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
5240 >>> df
5241 A B C D
5242 0 foo one small 1
5243 1 foo one large 2
5244 2 foo one large 2
5245 3 foo two small 3
5246 4 foo two small 3
5247 5 bar one large 4
5248 6 bar one small 5
5249 7 bar two small 6
5250 8 bar two large 7
5251
    >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
    ...                        columns=['C'], aggfunc=np.sum)
5254 >>> table
5255 C large small
5256 A B
5257 bar one 4.0 5.0
5258 two 7.0 6.0
5259 foo one 4.0 1.0
5260 two NaN 6.0
5261
    >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
    ...                        aggfunc={'D': np.mean,
    ...                                 'E': [min, max, np.mean]})
5275 >>> table
                      D   E
                   mean max mean min
5278 A C
5279 bar large 5.500000 16 14.5 13
5280 small 5.500000 15 14.5 14
5281 foo large 2.000000 10 9.5 9
5282 small 2.333333 12 11.0 8
5283
5284 Returns
5285 -------
5286 table : DataFrame
5287
    See Also
5289 --------
5290 DataFrame.pivot : pivot without aggregation that can handle
5291 non-numeric data
5292 """
5293
5294 @Substitution('')
5295 @Appender(_shared_docs['pivot_table'])
5296 def pivot_table(self, values=None, index=None, columns=None,
5297 aggfunc='mean', fill_value=None, margins=False,
5298 dropna=True, margins_name='All'):
5299 from pandas.core.reshape.pivot import pivot_table
5300 return pivot_table(self, values=values, index=index, columns=columns,
5301 aggfunc=aggfunc, fill_value=fill_value,
5302 margins=margins, dropna=dropna,
5303 margins_name=margins_name)
5304
5305 def stack(self, level=-1, dropna=True):
5306 """
5307 Stack the prescribed level(s) from columns to index.
5308
5309 Return a reshaped DataFrame or Series having a multi-level
5310 index with one or more new inner-most levels compared to the current
5311 DataFrame. The new inner-most levels are created by pivoting the
5312 columns of the current dataframe:
5313
5314 - if the columns have a single level, the output is a Series;
5315 - if the columns have multiple levels, the new index
5316 level(s) is (are) taken from the prescribed level(s) and
5317 the output is a DataFrame.
5318
5319 The new index levels are sorted.
5320
5321 Parameters
5322 ----------
5323 level : int, str, list, default -1
5324 Level(s) to stack from the column axis onto the index
5325 axis, defined as one index or label, or a list of indices
5326 or labels.
5327 dropna : bool, default True
5328 Whether to drop rows in the resulting Frame/Series with
5329 missing values. Stacking a column level onto the index
5330 axis can create combinations of index and column values
5331 that are missing from the original dataframe. See Examples
5332 section.
5333
5334 Returns
5335 -------
5336 DataFrame or Series
5337 Stacked dataframe or series.
5338
5339 See Also
5340 --------
5341 DataFrame.unstack : Unstack prescribed level(s) from index axis
5342 onto column axis.
5343 DataFrame.pivot : Reshape dataframe from long format to wide
5344 format.
5345 DataFrame.pivot_table : Create a spreadsheet-style pivot table
5346 as a DataFrame.
5347
5348 Notes
5349 -----
5350 The function is named by analogy with a collection of books
5351 being re-organised from being side by side on a horizontal
5352 position (the columns of the dataframe) to being stacked
        vertically on top of each other (in the index of the
5354 dataframe).
5355
5356 Examples
5357 --------
5358 **Single level columns**
5359
5360 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
5361 ... index=['cat', 'dog'],
5362 ... columns=['weight', 'height'])
5363
5364 Stacking a dataframe with a single level column axis returns a Series:
5365
5366 >>> df_single_level_cols
5367 weight height
5368 cat 0 1
5369 dog 2 3
5370 >>> df_single_level_cols.stack()
5371 cat weight 0
5372 height 1
5373 dog weight 2
5374 height 3
5375 dtype: int64
5376
5377 **Multi level columns: simple case**
5378
5379 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
5380 ... ('weight', 'pounds')])
5381 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
5382 ... index=['cat', 'dog'],
5383 ... columns=multicol1)
5384
5385 Stacking a dataframe with a multi-level column axis:
5386
5387 >>> df_multi_level_cols1
5388 weight
5389 kg pounds
5390 cat 1 2
5391 dog 2 4
5392 >>> df_multi_level_cols1.stack()
5393 weight
5394 cat kg 1
5395 pounds 2
5396 dog kg 2
5397 pounds 4
5398
5399 **Missing values**
5400
5401 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
5402 ... ('height', 'm')])
5403 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
5404 ... index=['cat', 'dog'],
5405 ... columns=multicol2)
5406
5407 It is common to have missing values when stacking a dataframe
5408 with multi-level columns, as the stacked dataframe typically
5409 has more values than the original dataframe. Missing values
5410 are filled with NaNs:
5411
5412 >>> df_multi_level_cols2
5413 weight height
5414 kg m
5415 cat 1.0 2.0
5416 dog 3.0 4.0
5417 >>> df_multi_level_cols2.stack()
5418 height weight
5419 cat kg NaN 1.0
5420 m 2.0 NaN
5421 dog kg NaN 3.0
5422 m 4.0 NaN
5423
5424 **Prescribing the level(s) to be stacked**
5425
5426 The first parameter controls which level or levels are stacked:
5427
5428 >>> df_multi_level_cols2.stack(0)
5429 kg m
5430 cat height NaN 2.0
5431 weight 1.0 NaN
5432 dog height NaN 4.0
5433 weight 3.0 NaN
5434 >>> df_multi_level_cols2.stack([0, 1])
5435 cat height m 2.0
5436 weight kg 1.0
5437 dog height m 4.0
5438 weight kg 3.0
5439 dtype: float64
5440
5441 **Dropping missing values**
5442
5443 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
5444 ... index=['cat', 'dog'],
5445 ... columns=multicol2)
5446
5447 Note that rows where all values are missing are dropped by
5448 default but this behaviour can be controlled via the dropna
5449 keyword parameter:
5450
5451 >>> df_multi_level_cols3
5452 weight height
5453 kg m
5454 cat NaN 1.0
5455 dog 2.0 3.0
5456 >>> df_multi_level_cols3.stack(dropna=False)
5457 height weight
5458 cat kg NaN NaN
5459 m 1.0 NaN
5460 dog kg NaN 2.0
5461 m 3.0 NaN
5462 >>> df_multi_level_cols3.stack(dropna=True)
5463 height weight
5464 cat m 1.0 NaN
5465 dog kg NaN 2.0
5466 m 3.0 NaN
5467 """
5468 from pandas.core.reshape.reshape import stack, stack_multiple
5469
5470 if isinstance(level, (tuple, list)):
5471 return stack_multiple(self, level, dropna=dropna)
5472 else:
5473 return stack(self, level, dropna=dropna)
5474
5475 def unstack(self, level=-1, fill_value=None):
5476 """
5477 Pivot a level of the (necessarily hierarchical) index labels, returning
5478 a DataFrame having a new level of column labels whose inner-most level
5479 consists of the pivoted index labels. If the index is not a MultiIndex,
5480 the output will be a Series (the analogue of stack when the columns are
5481 not a MultiIndex).
5482 The level involved will automatically get sorted.
5483
5484 Parameters
5485 ----------
5486 level : int, string, or list of these, default -1 (last level)
5487 Level(s) of index to unstack, can pass level name
5488 fill_value : replace NaN with this value if the unstack produces
5489 missing values
5490
5491 .. versionadded:: 0.18.0
5492
5493 See also
5494 --------
5495 DataFrame.pivot : Pivot a table based on column values.
5496 DataFrame.stack : Pivot a level of the column labels (inverse operation
5497 from `unstack`).
5498
5499 Examples
5500 --------
5501 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
5502 ... ('two', 'a'), ('two', 'b')])
5503 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
5504 >>> s
5505 one a 1.0
5506 b 2.0
5507 two a 3.0
5508 b 4.0
5509 dtype: float64
5510
5511 >>> s.unstack(level=-1)
5512 a b
5513 one 1.0 2.0
5514 two 3.0 4.0
5515
5516 >>> s.unstack(level=0)
5517 one two
5518 a 1.0 3.0
5519 b 2.0 4.0
5520
5521 >>> df = s.unstack(level=0)
5522 >>> df.unstack()
5523 one a 1.0
5524 b 2.0
5525 two a 3.0
5526 b 4.0
5527 dtype: float64
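
        ``fill_value`` replaces the missing entries that unstacking can
        introduce; a small constructed example:

        >>> s2 = pd.Series([1.0, 2.0, 3.0],
        ...                index=pd.MultiIndex.from_tuples([('one', 'a'),
        ...                                                 ('one', 'b'),
        ...                                                 ('two', 'a')]))
        >>> s2.unstack(fill_value=0.0)
               a    b
        one  1.0  2.0
        two  3.0  0.0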
5528
5529 Returns
5530 -------
5531 unstacked : DataFrame or Series
5532 """
5533 from pandas.core.reshape.reshape import unstack
5534 return unstack(self, level, fill_value)
5535
5536 _shared_docs['melt'] = ("""
5537 "Unpivots" a DataFrame from wide format to long format, optionally
5538 leaving identifier variables set.
5539
5540 This function is useful to massage a DataFrame into a format where one
5541 or more columns are identifier variables (`id_vars`), while all other
5542 columns, considered measured variables (`value_vars`), are "unpivoted" to
5543 the row axis, leaving just two non-identifier columns, 'variable' and
5544 'value'.
5545
5546 %(versionadded)s
5547 Parameters
5548 ----------
5549 frame : DataFrame
5550 id_vars : tuple, list, or ndarray, optional
5551 Column(s) to use as identifier variables.
5552 value_vars : tuple, list, or ndarray, optional
5553 Column(s) to unpivot. If not specified, uses all columns that
5554 are not set as `id_vars`.
5555 var_name : scalar
5556 Name to use for the 'variable' column. If None it uses
5557 ``frame.columns.name`` or 'variable'.
5558 value_name : scalar, default 'value'
5559 Name to use for the 'value' column.
5560 col_level : int or string, optional
5561 If columns are a MultiIndex then use this level to melt.
5562
5563 See also
5564 --------
5565 %(other)s
5566 pivot_table
5567 DataFrame.pivot
5568
5569 Examples
5570 --------
5571 >>> import pandas as pd
5572 >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
5573 ... 'B': {0: 1, 1: 3, 2: 5},
5574 ... 'C': {0: 2, 1: 4, 2: 6}})
5575 >>> df
5576 A B C
5577 0 a 1 2
5578 1 b 3 4
5579 2 c 5 6
5580
5581 >>> %(caller)sid_vars=['A'], value_vars=['B'])
5582 A variable value
5583 0 a B 1
5584 1 b B 3
5585 2 c B 5
5586
5587 >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
5588 A variable value
5589 0 a B 1
5590 1 b B 3
5591 2 c B 5
5592 3 a C 2
5593 4 b C 4
5594 5 c C 6
5595
5596 The names of 'variable' and 'value' columns can be customized:
5597
5598 >>> %(caller)sid_vars=['A'], value_vars=['B'],
5599 ... var_name='myVarname', value_name='myValname')
5600 A myVarname myValname
5601 0 a B 1
5602 1 b B 3
5603 2 c B 5
5604
5605 If you have multi-index columns:
5606
5607 >>> df.columns = [list('ABC'), list('DEF')]
5608 >>> df
5609 A B C
5610 D E F
5611 0 a 1 2
5612 1 b 3 4
5613 2 c 5 6
5614
5615 >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
5616 A variable value
5617 0 a B 1
5618 1 b B 3
5619 2 c B 5
5620
5621 >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
5622 (A, D) variable_0 variable_1 value
5623 0 a B E 1
5624 1 b B E 3
5625 2 c B E 5
5626
5627 """)
5628
5629 @Appender(_shared_docs['melt'] %
5630 dict(caller='df.melt(',
5631 versionadded='.. versionadded:: 0.20.0\n',
5632 other='melt'))
5633 def melt(self, id_vars=None, value_vars=None, var_name=None,
5634 value_name='value', col_level=None):
5635 from pandas.core.reshape.melt import melt
5636 return melt(self, id_vars=id_vars, value_vars=value_vars,
5637 var_name=var_name, value_name=value_name,
5638 col_level=col_level)
5639
5640 # ----------------------------------------------------------------------
5641 # Time series-related
5642
5643 def diff(self, periods=1, axis=0):
5644 """
5645 First discrete difference of element.
5646
5647 Calculates the difference of a DataFrame element compared with another
5648 element in the DataFrame (default is the element in the same column
5649 of the previous row).
5650
5651 Parameters
5652 ----------
5653 periods : int, default 1
5654 Periods to shift for calculating difference, accepts negative
5655 values.
5656 axis : {0 or 'index', 1 or 'columns'}, default 0
5657 Take difference over rows (0) or columns (1).
5658
            .. versionadded:: 0.16.1
5660
5661 Returns
5662 -------
5663 diffed : DataFrame
5664
5665 See Also
5666 --------
5667 Series.diff: First discrete difference for a Series.
5668 DataFrame.pct_change: Percent change over given number of periods.
5669 DataFrame.shift: Shift index by desired number of periods with an
5670 optional time freq.
5671
5672 Examples
5673 --------
5674 Difference with previous row
5675
5676 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
5677 ... 'b': [1, 1, 2, 3, 5, 8],
5678 ... 'c': [1, 4, 9, 16, 25, 36]})
5679 >>> df
5680 a b c
5681 0 1 1 1
5682 1 2 1 4
5683 2 3 2 9
5684 3 4 3 16
5685 4 5 5 25
5686 5 6 8 36
5687
5688 >>> df.diff()
5689 a b c
5690 0 NaN NaN NaN
5691 1 1.0 0.0 3.0
5692 2 1.0 1.0 5.0
5693 3 1.0 1.0 7.0
5694 4 1.0 2.0 9.0
5695 5 1.0 3.0 11.0
5696
5697 Difference with previous column
5698
5699 >>> df.diff(axis=1)
5700 a b c
5701 0 NaN 0.0 0.0
5702 1 NaN -1.0 3.0
5703 2 NaN -1.0 7.0
5704 3 NaN -1.0 13.0
5705 4 NaN 0.0 20.0
5706 5 NaN 2.0 28.0
5707
5708 Difference with 3rd previous row
5709
5710 >>> df.diff(periods=3)
5711 a b c
5712 0 NaN NaN NaN
5713 1 NaN NaN NaN
5714 2 NaN NaN NaN
5715 3 3.0 2.0 15.0
5716 4 3.0 4.0 21.0
5717 5 3.0 6.0 27.0
5718
5719 Difference with following row
5720
5721 >>> df.diff(periods=-1)
5722 a b c
5723 0 -1.0 0.0 -3.0
5724 1 -1.0 -1.0 -5.0
5725 2 -1.0 -1.0 -7.0
5726 3 -1.0 -2.0 -9.0
5727 4 -1.0 -3.0 -11.0
5728 5 NaN NaN NaN
5729 """
5730 bm_axis = self._get_block_manager_axis(axis)
5731 new_data = self._data.diff(n=periods, axis=bm_axis)
5732 return self._constructor(new_data)
5733
5734 # ----------------------------------------------------------------------
5735 # Function application
5736
5737 def _gotitem(self,
5738 key, # type: Union[str, List[str]]
5739 ndim, # type: int
5740 subset=None # type: Union[Series, DataFrame, None]
5741 ):
5742 # type: (...) -> Union[Series, DataFrame]
5743 """
        Sub-classes to define; returns a sliced object.
5746
5747 Parameters
5748 ----------
5749 key : string / list of selections
5750 ndim : 1,2
5751 requested ndim of result
5752 subset : object, default None
5753 subset to act on
5754 """
5755 if subset is None:
5756 subset = self
5757 elif subset.ndim == 1: # is Series
5758 return subset
5759
5760 # TODO: _shallow_copy(subset)?
5761 return subset[key]
5762
5763 _agg_doc = dedent("""
5764 The aggregation operations are always performed over an axis, either the
5765 index (default) or the column axis. This behavior is different from
5766 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
5767 `var`), where the default is to compute the aggregation of the flattened
5768 array, e.g., ``numpy.mean(arr_2d)`` as opposed to ``numpy.mean(arr_2d,
5769 axis=0)``.
5770
5771 `agg` is an alias for `aggregate`. Use the alias.
5772
5773 Examples
5774 --------
5775 >>> df = pd.DataFrame([[1, 2, 3],
5776 ... [4, 5, 6],
5777 ... [7, 8, 9],
5778 ... [np.nan, np.nan, np.nan]],
5779 ... columns=['A', 'B', 'C'])
5780
5781 Aggregate these functions over the rows.
5782
5783 >>> df.agg(['sum', 'min'])
5784 A B C
5785 sum 12.0 15.0 18.0
5786 min 1.0 2.0 3.0
5787
5788 Different aggregations per column.
5789
5790 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
5791 A B
5792 max NaN 8.0
5793 min 1.0 2.0
5794 sum 12.0 NaN
5795
5796 Aggregate over the columns.
5797
5798 >>> df.agg("mean", axis="columns")
5799 0 2.0
5800 1 5.0
5801 2 8.0
5802 3 NaN
5803 dtype: float64
5804
5805 See also
5806 --------
5807 DataFrame.apply : Perform any type of operations.
5808 DataFrame.transform : Perform transformation type operations.
5809 pandas.core.groupby.GroupBy : Perform operations over groups.
5810 pandas.core.resample.Resampler : Perform operations over resampled bins.
5811 pandas.core.window.Rolling : Perform operations over rolling window.
5812 pandas.core.window.Expanding : Perform operations over expanding window.
5813 pandas.core.window.EWM : Perform operation over exponential weighted
5814 window.
5815 """)
5816
5817 @Appender(_agg_doc)
5818 @Appender(_shared_docs['aggregate'] % dict(
5819 versionadded='.. versionadded:: 0.20.0',
5820 **_shared_doc_kwargs))
5821 def aggregate(self, func, axis=0, *args, **kwargs):
5822 axis = self._get_axis_number(axis)
5823
5824 # TODO: flipped axis
5825 result = None
5826 if axis == 0:
5827 try:
5828 result, how = self._aggregate(func, axis=0, *args, **kwargs)
5829 except TypeError:
5830 pass
5831 if result is None:
5832 return self.apply(func, axis=axis, args=args, **kwargs)
5833 return result
5834
5835 agg = aggregate
5836
5837 def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
5838 result_type=None, args=(), **kwds):
5839 """
5840 Apply a function along an axis of the DataFrame.
5841
5842 Objects passed to the function are Series objects whose index is
5843 either the DataFrame's index (``axis=0``) or the DataFrame's columns
5844 (``axis=1``). By default (``result_type=None``), the final return type
5845 is inferred from the return type of the applied function. Otherwise,
5846 it depends on the `result_type` argument.
5847
5848 Parameters
5849 ----------
5850 func : function
5851 Function to apply to each column or row.
5852 axis : {0 or 'index', 1 or 'columns'}, default 0
5853 Axis along which the function is applied:
5854
5855 * 0 or 'index': apply function to each column.
5856 * 1 or 'columns': apply function to each row.
5857 broadcast : bool, optional
5858 Only relevant for aggregation functions:
5859
5860 * ``False`` or ``None`` : returns a Series whose length is the
5861 length of the index or the number of columns (based on the
5862 `axis` parameter)
5863 * ``True`` : results will be broadcast to the original shape
5864 of the frame, the original index and columns will be retained.
5865
5866 .. deprecated:: 0.23.0
5867 This argument will be removed in a future version, replaced
5868 by result_type='broadcast'.
5869
5870 raw : bool, default False
5871 * ``False`` : passes each row or column as a Series to the
5872 function.
5873 * ``True`` : the passed function will receive ndarray objects
5874 instead.
5875 If you are just applying a NumPy reduction function this will
5876 achieve much better performance.
5877 reduce : bool or None, default None
5878 Try to apply reduction procedures. If the DataFrame is empty,
5879 `apply` will use `reduce` to determine whether the result
5880 should be a Series or a DataFrame. If ``reduce=None`` (the
5881 default), `apply`'s return value will be guessed by calling
5882 `func` on an empty Series
5883 (note: while guessing, exceptions raised by `func` will be
5884 ignored).
5885 If ``reduce=True`` a Series will always be returned, and if
5886 ``reduce=False`` a DataFrame will always be returned.
5887
5888 .. deprecated:: 0.23.0
5889 This argument will be removed in a future version, replaced
5890 by ``result_type='reduce'``.
5891
5892 result_type : {'expand', 'reduce', 'broadcast', None}, default None
5893 These only act when ``axis=1`` (columns):
5894
5895 * 'expand' : list-like results will be turned into columns.
5896 * 'reduce' : returns a Series if possible rather than expanding
5897 list-like results. This is the opposite of 'expand'.
5898 * 'broadcast' : results will be broadcast to the original shape
5899 of the DataFrame, the original index and columns will be
5900 retained.
5901
5902 The default behaviour (None) depends on the return value of the
5903 applied function: list-like results will be returned as a Series
5904 of those. However if the apply function returns a Series these
5905 are expanded to columns.
5906
5907 .. versionadded:: 0.23.0
5908
5909 args : tuple
5910 Positional arguments to pass to `func` in addition to the
5911 array/series.
5912 **kwds
5913 Additional keyword arguments to pass as keywords arguments to
5914 `func`.
5915
5916 Notes
5917 -----
5918 In the current implementation apply calls `func` twice on the
5919 first column/row to decide whether it can take a fast or slow
5920 code path. This can lead to unexpected behavior if `func` has
5921 side-effects, as they will take effect twice for the first
5922 column/row.
5923
5924 See also
5925 --------
        DataFrame.applymap : For elementwise operations.
        DataFrame.aggregate : Only perform aggregating type operations.
        DataFrame.transform : Only perform transforming type operations.
5929
5930 Examples
5931 --------
5932
5933 >>> df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
5934 >>> df
5935 A B
5936 0 4 9
5937 1 4 9
5938 2 4 9
5939
5940 Using a numpy universal function (in this case the same as
5941 ``np.sqrt(df)``):
5942
5943 >>> df.apply(np.sqrt)
5944 A B
5945 0 2.0 3.0
5946 1 2.0 3.0
5947 2 2.0 3.0
5948
5949 Using a reducing function on either axis
5950
5951 >>> df.apply(np.sum, axis=0)
5952 A 12
5953 B 27
5954 dtype: int64
5955
5956 >>> df.apply(np.sum, axis=1)
5957 0 13
5958 1 13
5959 2 13
5960 dtype: int64
5961
        Returning a list-like will result in a Series
5963
5964 >>> df.apply(lambda x: [1, 2], axis=1)
5965 0 [1, 2]
5966 1 [1, 2]
5967 2 [1, 2]
5968 dtype: object
5969
        Passing ``result_type='expand'`` will expand list-like results
        to columns of a DataFrame.
5972
5973 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
5974 0 1
5975 0 1 2
5976 1 1 2
5977 2 1 2
5978
5979 Returning a Series inside the function is similar to passing
5980 ``result_type='expand'``. The resulting column names
5981 will be the Series index.
5982
5983 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
5984 foo bar
5985 0 1 2
5986 1 1 2
5987 2 1 2
5988
5989 Passing ``result_type='broadcast'`` will ensure the same shape
5990 result, whether list-like or scalar is returned by the function,
5991 and broadcast it along the axis. The resulting column names will
5992 be the originals.
5993
5994 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
5995 A B
5996 0 1 2
5997 1 1 2
5998 2 1 2
5999
6000 Returns
6001 -------
6002 applied : Series or DataFrame
6003 """
6004 from pandas.core.apply import frame_apply
6005 op = frame_apply(self,
6006 func=func,
6007 axis=axis,
6008 broadcast=broadcast,
6009 raw=raw,
6010 reduce=reduce,
6011 result_type=result_type,
6012 args=args,
6013 kwds=kwds)
6014 return op.get_result()
6015
6016 def applymap(self, func):
6017 """
6018 Apply a function to a Dataframe elementwise.
6019
6020 This method applies a function that accepts and returns a scalar
6021 to every element of a DataFrame.
6022
6023 Parameters
6024 ----------
6025 func : callable
6026 Python function, returns a single value from a single value.
6027
6028 Returns
6029 -------
6030 DataFrame
6031 Transformed DataFrame.
6032
6033 See also
6034 --------
6035 DataFrame.apply : Apply a function along input axis of DataFrame
6036
6037 Examples
6038 --------
6039 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
6040 >>> df
6041 0 1
6042 0 1.000 2.120
6043 1 3.356 4.567
6044
6045 >>> df.applymap(lambda x: len(str(x)))
6046 0 1
6047 0 3 4
6048 1 5 5
6049
6050 Note that a vectorized version of `func` often exists, which will
6051 be much faster. You could square each number elementwise.
6052
6053 >>> df.applymap(lambda x: x**2)
6054 0 1
6055 0 1.000000 4.494400
6056 1 11.262736 20.857489
6057
6058 But it's better to avoid applymap in that case.
6059
6060 >>> df ** 2
6061 0 1
6062 0 1.000000 4.494400
6063 1 11.262736 20.857489
6064 """
6065
6066 # if we have a dtype == 'M8[ns]', provide boxed values
6067 def infer(x):
6068 if x.empty:
6069 return lib.map_infer(x, func)
6070 return lib.map_infer(x.astype(object).values, func)
6071
6072 return self.apply(infer)
6073
6074 # ----------------------------------------------------------------------
6075 # Merging / joining methods
6076
6077 def append(self, other, ignore_index=False,
6078 verify_integrity=False, sort=None):
6079 """
6080 Append rows of `other` to the end of this frame, returning a new
6081 object. Columns not in this frame are added as new columns.
6082
6083 Parameters
6084 ----------
6085 other : DataFrame or Series/dict-like object, or list of these
6086 The data to append.
6087 ignore_index : boolean, default False
6088 If True, do not use the index labels.
6089 verify_integrity : boolean, default False
6090 If True, raise ValueError on creating index with duplicates.
6091 sort : boolean, default None
6092 Sort columns if the columns of `self` and `other` are not aligned.
6093 The default sorting is deprecated and will change to not-sorting
6094 in a future version of pandas. Explicitly pass ``sort=True`` to
6095 silence the warning and sort. Explicitly pass ``sort=False`` to
6096 silence the warning and not sort.
6097
6098 .. versionadded:: 0.23.0
6099
6100 Returns
6101 -------
6102 appended : DataFrame
6103
6104 Notes
6105 -----
6106 If a list of dict/series is passed and the keys are all contained in
6107 the DataFrame's index, the order of the columns in the resulting
6108 DataFrame will be unchanged.
6109
6110 Iteratively appending rows to a DataFrame can be more computationally
6111 intensive than a single concatenate. A better solution is to append
6112 those rows to a list and then concatenate the list with the original
6113 DataFrame all at once.
6114
6115 See also
6116 --------
6117 pandas.concat : General function to concatenate DataFrame, Series
6118 or Panel objects
6119
6120 Examples
6121 --------
6122
6123 >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
6124 >>> df
6125 A B
6126 0 1 2
6127 1 3 4
6128 >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
6129 >>> df.append(df2)
6130 A B
6131 0 1 2
6132 1 3 4
6133 0 5 6
6134 1 7 8
6135
6136 With `ignore_index` set to True:
6137
6138 >>> df.append(df2, ignore_index=True)
6139 A B
6140 0 1 2
6141 1 3 4
6142 2 5 6
6143 3 7 8
6144
6145 The following, while not recommended methods for generating DataFrames,
6146 show two ways to generate a DataFrame from multiple data sources.
6147
6148 Less efficient:
6149
6150 >>> df = pd.DataFrame(columns=['A'])
6151 >>> for i in range(5):
6152 ... df = df.append({'A': i}, ignore_index=True)
6153 >>> df
6154 A
6155 0 0
6156 1 1
6157 2 2
6158 3 3
6159 4 4
6160
6161 More efficient:
6162
6163 >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
6164 ... ignore_index=True)
6165 A
6166 0 0
6167 1 1
6168 2 2
6169 3 3
6170 4 4
6171
6172 """
6173 if isinstance(other, (Series, dict)):
6174 if isinstance(other, dict):
6175 other = Series(other)
6176 if other.name is None and not ignore_index:
6177 raise TypeError('Can only append a Series if ignore_index=True'
6178 ' or if the Series has a name')
6179
6180 if other.name is None:
6181 index = None
6182 else:
6183 # other must have the same index name as self, otherwise
6184 # index name will be reset
6185 index = Index([other.name], name=self.index.name)
6186
6187 idx_diff = other.index.difference(self.columns)
6188 try:
6189 combined_columns = self.columns.append(idx_diff)
6190 except TypeError:
6191 combined_columns = self.columns.astype(object).append(idx_diff)
6192 other = other.reindex(combined_columns, copy=False)
6193 other = DataFrame(other.values.reshape((1, len(other))),
6194 index=index,
6195 columns=combined_columns)
6196 other = other._convert(datetime=True, timedelta=True)
6197 if not self.columns.equals(combined_columns):
6198 self = self.reindex(columns=combined_columns)
6199 elif isinstance(other, list) and not isinstance(other[0], DataFrame):
6200 other = DataFrame(other)
6201 if (self.columns.get_indexer(other.columns) >= 0).all():
6202 other = other.loc[:, self.columns]
6203
6204 from pandas.core.reshape.concat import concat
6205 if isinstance(other, (list, tuple)):
6206 to_concat = [self] + other
6207 else:
6208 to_concat = [self, other]
6209 return concat(to_concat, ignore_index=ignore_index,
6210 verify_integrity=verify_integrity,
6211 sort=sort)
6212
6213 def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
6214 sort=False):
6215 """
        Join columns with other DataFrame either on index or on a key
        column. Efficiently join multiple DataFrame objects by index at
        once by passing a list.
6219
6220 Parameters
6221 ----------
6222 other : DataFrame, Series with name field set, or list of DataFrame
6223 Index should be similar to one of the columns in this one. If a
6224 Series is passed, its name attribute must be set, and that will be
6225 used as the column name in the resulting joined DataFrame
6226 on : name, tuple/list of names, or array-like
6227 Column or index level name(s) in the caller to join on the index
6228 in `other`, otherwise joins index-on-index. If multiple
6229 values given, the `other` DataFrame must have a MultiIndex. Can
6230 pass an array as the join key if it is not already contained in
6231 the calling DataFrame. Like an Excel VLOOKUP operation
6232 how : {'left', 'right', 'outer', 'inner'}, default: 'left'
6233 How to handle the operation of the two objects.
6234
6235 * left: use calling frame's index (or column if on is specified)
6236 * right: use other frame's index
6237 * outer: form union of calling frame's index (or column if on is
6238 specified) with other frame's index, and sort it
6239 lexicographically
6240 * inner: form intersection of calling frame's index (or column if
6241 on is specified) with other frame's index, preserving the order
6242 of the calling's one
6243 lsuffix : string
6244 Suffix to use from left frame's overlapping columns
6245 rsuffix : string
6246 Suffix to use from right frame's overlapping columns
6247 sort : boolean, default False
6248 Order result DataFrame lexicographically by the join key. If False,
6249 the order of the join key depends on the join type (how keyword)
6250
6251 Notes
6252 -----
6253 on, lsuffix, and rsuffix options are not supported when passing a list
6254 of DataFrame objects
6255
6256 Support for specifying index levels as the `on` parameter was added
6257 in version 0.23.0
6258
6259 Examples
6260 --------
6261 >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
6262 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
6263
6264 >>> caller
6265 A key
6266 0 A0 K0
6267 1 A1 K1
6268 2 A2 K2
6269 3 A3 K3
6270 4 A4 K4
6271 5 A5 K5
6272
6273 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
6274 ... 'B': ['B0', 'B1', 'B2']})
6275
6276 >>> other
6277 B key
6278 0 B0 K0
6279 1 B1 K1
6280 2 B2 K2
6281
6282 Join DataFrames using their indexes.
6283
        >>> caller.join(other, lsuffix='_caller', rsuffix='_other')
            A key_caller    B key_other
        0  A0         K0   B0        K0
        1  A1         K1   B1        K1
        2  A2         K2   B2        K2
        3  A3         K3  NaN       NaN
        4  A4         K4  NaN       NaN
        5  A5         K5  NaN       NaN
6293
6294
6295 If we want to join using the key columns, we need to set key to be
6296 the index in both caller and other. The joined DataFrame will have
6297 key as its index.
6298
        >>> caller.set_index('key').join(other.set_index('key'))
              A    B
        key
        K0   A0   B0
        K1   A1   B1
        K2   A2   B2
        K3   A3  NaN
        K4   A4  NaN
        K5   A5  NaN
6309
6310 Another option to join using the key columns is to use the on
6311 parameter. DataFrame.join always uses other's index but we can use any
6312 column in the caller. This method preserves the original caller's
6313 index in the result.
6314
        >>> caller.join(other.set_index('key'), on='key')
            A key    B
        0  A0  K0   B0
        1  A1  K1   B1
        2  A2  K2   B2
        3  A3  K3  NaN
        4  A4  K4  NaN
        5  A5  K5  NaN
6324
6325
6326 See also
6327 --------
6328 DataFrame.merge : For column(s)-on-columns(s) operations
6329
6330 Returns
6331 -------
6332 joined : DataFrame
6333 """
6334 # For SparseDataFrame's benefit
6335 return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
6336 rsuffix=rsuffix, sort=sort)
6337
6338 def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
6339 sort=False):
6340 from pandas.core.reshape.merge import merge
6341 from pandas.core.reshape.concat import concat
6342
6343 if isinstance(other, Series):
6344 if other.name is None:
6345 raise ValueError('Other Series must have a name')
6346 other = DataFrame({other.name: other})
6347
6348 if isinstance(other, DataFrame):
6349 return merge(self, other, left_on=on, how=how,
6350 left_index=on is None, right_index=True,
6351 suffixes=(lsuffix, rsuffix), sort=sort)
6352 else:
6353 if on is not None:
6354 raise ValueError('Joining multiple DataFrames only supported'
6355 ' for joining on index')
6356
6357 frames = [self] + list(other)
6358
6359 can_concat = all(df.index.is_unique for df in frames)
6360
6361 # join indexes only using concat
6362 if can_concat:
6363 if how == 'left':
6364 how = 'outer'
6365 join_axes = [self.index]
6366 else:
6367 join_axes = None
6368 return concat(frames, axis=1, join=how, join_axes=join_axes,
6369 verify_integrity=True)
6370
6371 joined = frames[0]
6372
6373 for frame in frames[1:]:
6374 joined = merge(joined, frame, how=how, left_index=True,
6375 right_index=True)
6376
6377 return joined
6378
6379 @Substitution('')
6380 @Appender(_merge_doc, indents=2)
6381 def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
6382 left_index=False, right_index=False, sort=False,
6383 suffixes=('_x', '_y'), copy=True, indicator=False,
6384 validate=None):
6385 from pandas.core.reshape.merge import merge
6386 return merge(self, right, how=how, on=on, left_on=left_on,
6387 right_on=right_on, left_index=left_index,
6388 right_index=right_index, sort=sort, suffixes=suffixes,
6389 copy=copy, indicator=indicator, validate=validate)
6390
6391 def round(self, decimals=0, *args, **kwargs):
6392 """
6393 Round a DataFrame to a variable number of decimal places.
6394
6395 Parameters
6396 ----------
6397 decimals : int, dict, Series
6398 Number of decimal places to round each column to. If an int is
6399 given, round each column to the same number of places.
6400 Otherwise dict and Series round to variable numbers of places.
6401 Column names should be in the keys if `decimals` is a
6402 dict-like, or in the index if `decimals` is a Series. Any
6403 columns not included in `decimals` will be left as is. Elements
6404 of `decimals` which are not columns of the input will be
6405 ignored.
6406
6407 Examples
6408 --------
6409 >>> df = pd.DataFrame(np.random.random([3, 3]),
6410 ... columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
6411 >>> df
6412 A B C
6413 first 0.028208 0.992815 0.173891
6414 second 0.038683 0.645646 0.577595
6415 third 0.877076 0.149370 0.491027
6416 >>> df.round(2)
6417 A B C
6418 first 0.03 0.99 0.17
6419 second 0.04 0.65 0.58
6420 third 0.88 0.15 0.49
6421 >>> df.round({'A': 1, 'C': 2})
6422 A B C
6423 first 0.0 0.992815 0.17
6424 second 0.0 0.645646 0.58
6425 third 0.9 0.149370 0.49
6426 >>> decimals = pd.Series([1, 0, 2], index=['A', 'B', 'C'])
6427 >>> df.round(decimals)
6428 A B C
6429 first 0.0 1 0.17
6430 second 0.0 1 0.58
6431 third 0.9 0 0.49
6432
6433 Returns
6434 -------
6435 DataFrame object
6436
6437 See Also
6438 --------
6439 numpy.around
6440 Series.round
6441
6442 """
6443 from pandas.core.reshape.concat import concat
6444
6445 def _dict_round(df, decimals):
6446 for col, vals in df.iteritems():
6447 try:
6448 yield _series_round(vals, decimals[col])
6449 except KeyError:
6450 yield vals
6451
6452 def _series_round(s, decimals):
6453 if is_integer_dtype(s) or is_float_dtype(s):
6454 return s.round(decimals)
6455 return s
6456
6457 nv.validate_round(args, kwargs)
6458
6459 if isinstance(decimals, (dict, Series)):
6460 if isinstance(decimals, Series):
6461 if not decimals.index.is_unique:
6462 raise ValueError("Index of decimals must be unique")
6463 new_cols = [col for col in _dict_round(self, decimals)]
6464 elif is_integer(decimals):
6465 # Dispatch to Series.round
6466 new_cols = [_series_round(v, decimals)
6467 for _, v in self.iteritems()]
6468 else:
6469 raise TypeError("decimals must be an integer, a dict-like or a "
6470 "Series")
6471
6472 if len(new_cols) > 0:
6473 return self._constructor(concat(new_cols, axis=1),
6474 index=self.index,
6475 columns=self.columns)
6476 else:
6477 return self
6478
6479 # ----------------------------------------------------------------------
6480 # Statistical methods, etc.
6481
6482 def corr(self, method='pearson', min_periods=1):
6483 """
        Compute pairwise correlation of columns, excluding NA/null values.
6485
6486 Parameters
6487 ----------
6488 method : {'pearson', 'kendall', 'spearman'}
6489 * pearson : standard correlation coefficient
6490 * kendall : Kendall Tau correlation coefficient
6491 * spearman : Spearman rank correlation
6492 min_periods : int, optional
6493 Minimum number of observations required per pair of columns
6494 to have a valid result. Currently only available for pearson
6495 and spearman correlation
6496
6497 Returns
6498 -------
6499 y : DataFrame
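
        Examples
        --------
        A small constructed example, with one perfectly correlated and one
        perfectly anti-correlated pair of columns:

        >>> df = pd.DataFrame({'A': [1, 2, 3, 4],
        ...                    'B': [2, 4, 6, 8],
        ...                    'C': [4, 3, 2, 1]})
        >>> df.corr()
             A    B    C
        A  1.0  1.0 -1.0
        B  1.0  1.0 -1.0
        C -1.0 -1.0  1.0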
6500 """
6501 numeric_df = self._get_numeric_data()
6502 cols = numeric_df.columns
6503 idx = cols.copy()
6504 mat = numeric_df.values
6505
6506 if method == 'pearson':
6507 correl = libalgos.nancorr(_ensure_float64(mat), minp=min_periods)
6508 elif method == 'spearman':
6509 correl = libalgos.nancorr_spearman(_ensure_float64(mat),
6510 minp=min_periods)
6511 else:
6512 if min_periods is None:
6513 min_periods = 1
6514 mat = _ensure_float64(mat).T
6515 corrf = nanops.get_corr_func(method)
6516 K = len(cols)
6517 correl = np.empty((K, K), dtype=float)
6518 mask = np.isfinite(mat)
6519 for i, ac in enumerate(mat):
6520 for j, bc in enumerate(mat):
6521 if i > j:
6522 continue
6523
6524 valid = mask[i] & mask[j]
6525 if valid.sum() < min_periods:
6526 c = np.nan
6527 elif i == j:
6528 c = 1.
6529 elif not valid.all():
6530 c = corrf(ac[valid], bc[valid])
6531 else:
6532 c = corrf(ac, bc)
6533 correl[i, j] = c
6534 correl[j, i] = c
6535
6536 return self._constructor(correl, index=idx, columns=cols)
6537
6538 def cov(self, min_periods=None):
6539 """
6540 Compute pairwise covariance of columns, excluding NA/null values.
6541
6542 Compute the pairwise covariance among the series of a DataFrame.
6543 The returned data frame is the `covariance matrix
6544 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
6545 of the DataFrame.
6546
6547 Both NA and null values are automatically excluded from the
6548 calculation. (See the note below about bias from missing values.)
6549 A threshold can be set for the minimum number of
6550 observations for each value created. Comparisons with observations
6551 below this threshold will be returned as ``NaN``.
6552
6553 This method is generally used for the analysis of time series data to
6554 understand the relationship between different measures
6555 across time.
6556
6557 Parameters
6558 ----------
6559 min_periods : int, optional
6560 Minimum number of observations required per pair of columns
6561 to have a valid result.
6562
6563 Returns
6564 -------
6565 DataFrame
6566 The covariance matrix of the series of the DataFrame.
6567
6568 See Also
6569 --------
6570 pandas.Series.cov : compute covariance with another Series
        pandas.core.window.EWM.cov : exponential weighted sample covariance
6572 pandas.core.window.Expanding.cov : expanding sample covariance
6573 pandas.core.window.Rolling.cov : rolling sample covariance
6574
6575 Notes
6576 -----
6577 Returns the covariance matrix of the DataFrame's time series.
6578 The covariance is normalized by N-1.
6579
6580 For DataFrames that have Series that are missing data (assuming that
6581 data is `missing at random
6582 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
6583 the returned covariance matrix will be an unbiased estimate
6584 of the variance and covariance between the member Series.
6585
6586 However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be positive
6588 semi-definite. This could lead to estimate correlations having
6589 absolute values which are greater than one, and/or a non-invertible
6590 covariance matrix. See `Estimation of covariance matrices
6591 <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
6592 matrices>`__ for more details.
6593
6594 Examples
6595 --------
6596 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
6597 ... columns=['dogs', 'cats'])
6598 >>> df.cov()
6599 dogs cats
6600 dogs 0.666667 -1.000000
6601 cats -1.000000 1.666667
6602
6603 >>> np.random.seed(42)
6604 >>> df = pd.DataFrame(np.random.randn(1000, 5),
6605 ... columns=['a', 'b', 'c', 'd', 'e'])
6606 >>> df.cov()
6607 a b c d e
6608 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
6609 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
6610 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
6611 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
6612 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
6613
6614 **Minimum number of periods**
6615
6616 This method also supports an optional ``min_periods`` keyword
6617 that specifies the required minimum number of non-NA observations for
6618 each column pair in order to have a valid result:
6619
6620 >>> np.random.seed(42)
6621 >>> df = pd.DataFrame(np.random.randn(20, 3),
6622 ... columns=['a', 'b', 'c'])
6623 >>> df.loc[df.index[:5], 'a'] = np.nan
6624 >>> df.loc[df.index[5:10], 'b'] = np.nan
6625 >>> df.cov(min_periods=12)
6626 a b c
6627 a 0.316741 NaN -0.150812
6628 b NaN 1.248003 0.191417
6629 c -0.150812 0.191417 0.895202
6630 """
6631 numeric_df = self._get_numeric_data()
6632 cols = numeric_df.columns
6633 idx = cols.copy()
6634 mat = numeric_df.values
6635
6636 if notna(mat).all():
6637 if min_periods is not None and min_periods > len(mat):
6638 baseCov = np.empty((mat.shape[1], mat.shape[1]))
6639 baseCov.fill(np.nan)
6640 else:
6641 baseCov = np.cov(mat.T)
6642 baseCov = baseCov.reshape((len(cols), len(cols)))
6643 else:
6644 baseCov = libalgos.nancorr(_ensure_float64(mat), cov=True,
6645 minp=min_periods)
6646
6647 return self._constructor(baseCov, index=idx, columns=cols)
6648
6649 def corrwith(self, other, axis=0, drop=False):
6650 """
6651 Compute pairwise correlation between rows or columns of two DataFrame
6652 objects.
6653
6654 Parameters
6655 ----------
6656 other : DataFrame, Series
6657 axis : {0 or 'index', 1 or 'columns'}, default 0
6658 0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
6659 drop : boolean, default False
            Drop missing indices from result; default returns union of all.
6661
6662 Returns
6663 -------
6664 correls : Series
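
        Examples
        --------
        A small constructed example; matching columns are aligned before
        the correlation is computed:

        >>> df1 = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [4, 3, 2, 1]})
        >>> df2 = pd.DataFrame({'A': [2, 4, 6, 8], 'B': [1, 2, 3, 4]})
        >>> df1.corrwith(df2)
        A    1.0
        B   -1.0
        dtype: float64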
6665 """
6666 axis = self._get_axis_number(axis)
6667 this = self._get_numeric_data()
6668
6669 if isinstance(other, Series):
6670 return this.apply(other.corr, axis=axis)
6671
6672 other = other._get_numeric_data()
6673
6674 left, right = this.align(other, join='inner', copy=False)
6675
6676 # mask missing values
6677 left = left + right * 0
6678 right = right + left * 0
6679
6680 if axis == 1:
6681 left = left.T
6682 right = right.T
6683
6684 # demeaned data
6685 ldem = left - left.mean()
6686 rdem = right - right.mean()
6687
6688 num = (ldem * rdem).sum()
6689 dom = (left.count() - 1) * left.std() * right.std()
6690
6691 correl = num / dom
6692
6693 if not drop:
6694 raxis = 1 if axis == 0 else 0
6695 result_index = this._get_axis(raxis).union(other._get_axis(raxis))
6696 correl = correl.reindex(result_index)
6697
6698 return correl
6699
6700 # ----------------------------------------------------------------------
6701 # ndarray-like stats methods
6702
6703 def count(self, axis=0, level=None, numeric_only=False):
6704 """
6705 Count non-NA cells for each column or row.
6706
6707 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
6708 on `pandas.options.mode.use_inf_as_na`) are considered NA.
6709
6710 Parameters
6711 ----------
6712 axis : {0 or 'index', 1 or 'columns'}, default 0
6713 If 0 or 'index' counts are generated for each column.
6714 If 1 or 'columns' counts are generated for each **row**.
6715 level : int or str, optional
6716 If the axis is a `MultiIndex` (hierarchical), count along a
6717 particular `level`, collapsing into a `DataFrame`.
6718 A `str` specifies the level name.
6719 numeric_only : boolean, default False
6720 Include only `float`, `int` or `boolean` data.
6721
6722 Returns
6723 -------
6724 Series or DataFrame
6725 For each column/row the number of non-NA/null entries.
6726 If `level` is specified returns a `DataFrame`.
6727
6728 See Also
6729 --------
6730 Series.count: number of non-NA elements in a Series
6731 DataFrame.shape: number of DataFrame rows and columns (including NA
6732 elements)
6733 DataFrame.isna: boolean same-sized DataFrame showing places of NA
6734 elements
6735
6736 Examples
6737 --------
6738 Constructing DataFrame from a dictionary:
6739
6740 >>> df = pd.DataFrame({"Person":
6741 ... ["John", "Myla", None, "John", "Myla"],
6742 ... "Age": [24., np.nan, 21., 33, 26],
6743 ... "Single": [False, True, True, True, False]})
6744 >>> df
6745 Person Age Single
6746 0 John 24.0 False
6747 1 Myla NaN True
6748 2 None 21.0 True
6749 3 John 33.0 True
6750 4 Myla 26.0 False
6751
6752 Notice the uncounted NA values:
6753
6754 >>> df.count()
6755 Person 4
6756 Age 4
6757 Single 5
6758 dtype: int64
6759
6760 Counts for each **row**:
6761
6762 >>> df.count(axis='columns')
6763 0 3
6764 1 2
6765 2 2
6766 3 3
6767 4 3
6768 dtype: int64
6769
6770 Counts for one level of a `MultiIndex`:
6771
6772 >>> df.set_index(["Person", "Single"]).count(level="Person")
6773 Age
6774 Person
6775 John 2
6776 Myla 1
6777 """
6778 axis = self._get_axis_number(axis)
6779 if level is not None:
6780 return self._count_level(level, axis=axis,
6781 numeric_only=numeric_only)
6782
6783 if numeric_only:
6784 frame = self._get_numeric_data()
6785 else:
6786 frame = self
6787
6788 # GH #423
6789 if len(frame._get_axis(axis)) == 0:
6790 result = Series(0, index=frame._get_agg_axis(axis))
6791 else:
6792 if frame._is_mixed_type or frame._data.any_extension_types:
6793 # the or any_extension_types is really only hit for single-
6794 # column frames with an extension array
6795 result = notna(frame).sum(axis=axis)
6796 else:
6797 # GH13407
6798 series_counts = notna(frame).sum(axis=axis)
6799 counts = series_counts.values
6800 result = Series(counts, index=frame._get_agg_axis(axis))
6801
6802 return result.astype('int64')
6803
6804 def _count_level(self, level, axis=0, numeric_only=False):
6805 if numeric_only:
6806 frame = self._get_numeric_data()
6807 else:
6808 frame = self
6809
6810 count_axis = frame._get_axis(axis)
6811 agg_axis = frame._get_agg_axis(axis)
6812
6813 if not isinstance(count_axis, MultiIndex):
6814 raise TypeError("Can only count levels on hierarchical "
6815 "{ax}.".format(ax=self._get_axis_name(axis)))
6816
6817 if frame._is_mixed_type:
6818 # Since we have mixed types, calling notna(frame.values) might
6819 # upcast everything to object
6820 mask = notna(frame).values
6821 else:
6822 # But use the speedup when we have homogeneous dtypes
6823 mask = notna(frame.values)
6824
6825 if axis == 1:
6826 # We're transposing the mask rather than frame to avoid potential
6827 # upcasts to object, which induces a ~20x slowdown
6828 mask = mask.T
6829
6830 if isinstance(level, compat.string_types):
6831 level = count_axis._get_level_number(level)
6832
6833 level_index = count_axis.levels[level]
6834 labels = _ensure_int64(count_axis.labels[level])
6835 counts = lib.count_level_2d(mask, labels, len(level_index), axis=0)
6836
6837 result = DataFrame(counts, index=level_index, columns=agg_axis)
6838
6839 if axis == 1:
6840 # Undo our earlier transpose
6841 return result.T
6842 else:
6843 return result
6844
6845 def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
6846 filter_type=None, **kwds):
6847 if axis is None and filter_type == 'bool':
6848 labels = None
6849 constructor = None
6850 else:
6851 # TODO: Make other agg func handle axis=None properly
6852 axis = self._get_axis_number(axis)
6853 labels = self._get_agg_axis(axis)
6854 constructor = self._constructor
6855
6856 def f(x):
6857 return op(x, axis=axis, skipna=skipna, **kwds)
6858
6859 # exclude timedelta/datetime unless we are uniform types
6860 if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
6861 numeric_only = True
6862
6863 if numeric_only is None:
6864 try:
6865 values = self.values
6866 result = f(values)
6867
6868 if (filter_type == 'bool' and is_object_dtype(values) and
6869 axis is None):
6870 # work around https://github.com/numpy/numpy/issues/10489
6871 # TODO: combine with hasattr(result, 'dtype') further down
6872 # hard since we don't have `values` down there.
6873 result = np.bool_(result)
6874 except Exception as e:
6875
6876 # try by-column first
6877 if filter_type is None and axis == 0:
6878 try:
6879
6880 # this can end up with a non-reduction
6881 # but not always. if the types are mixed
6882 # with datelike then need to make sure a series
6883
6884 # we only end up here if we have not specified
6885 # numeric_only and yet we have tried a
6886 # column-by-column reduction, where we have mixed type.
6887 # So let's just do what we can
6888 from pandas.core.apply import frame_apply
6889 opa = frame_apply(self,
6890 func=f,
6891 result_type='expand',
6892 ignore_failures=True)
6893 result = opa.get_result()
6894 if result.ndim == self.ndim:
6895 result = result.iloc[0]
6896 return result
6897 except Exception:
6898 pass
6899
6900 if filter_type is None or filter_type == 'numeric':
6901 data = self._get_numeric_data()
6902 elif filter_type == 'bool':
6903 data = self._get_bool_data()
6904 else: # pragma: no cover
6905 e = NotImplementedError(
                    "Handling exception with filter_type {f} not "
                    "implemented.".format(f=filter_type))
6908 raise_with_traceback(e)
6909 with np.errstate(all='ignore'):
6910 result = f(data.values)
6911 labels = data._get_agg_axis(axis)
6912 else:
6913 if numeric_only:
6914 if filter_type is None or filter_type == 'numeric':
6915 data = self._get_numeric_data()
6916 elif filter_type == 'bool':
6917 data = self._get_bool_data()
6918 else: # pragma: no cover
                    msg = ("Generating numeric_only data with filter_type "
                           "{f} not supported.".format(f=filter_type))
6921 raise NotImplementedError(msg)
6922 values = data.values
6923 labels = data._get_agg_axis(axis)
6924 else:
6925 values = self.values
6926 result = f(values)
6927
6928 if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
6929 try:
6930 if filter_type is None or filter_type == 'numeric':
6931 result = result.astype(np.float64)
6932 elif filter_type == 'bool' and notna(result).all():
6933 result = result.astype(np.bool_)
6934 except (ValueError, TypeError):
6935
6936 # try to coerce to the original dtypes item by item if we can
6937 if axis == 0:
6938 result = coerce_to_dtypes(result, self.dtypes)
6939
6940 if constructor is not None:
6941 result = Series(result, index=labels)
6942 return result
6943
6944 def nunique(self, axis=0, dropna=True):
6945 """
6946 Return Series with number of distinct observations over requested
6947 axis.
6948
6949 .. versionadded:: 0.20.0
6950
6951 Parameters
6952 ----------
6953 axis : {0 or 'index', 1 or 'columns'}, default 0
6954 dropna : boolean, default True
6955 Don't include NaN in the counts.
6956
6957 Returns
6958 -------
6959 nunique : Series
6960
6961 Examples
6962 --------
6963 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
6964 >>> df.nunique()
6965 A 3
6966 B 1
6967
6968 >>> df.nunique(axis=1)
6969 0 1
6970 1 2
6971 2 2
6972 """
6973 return self.apply(Series.nunique, axis=axis, dropna=dropna)
6974
6975 def idxmin(self, axis=0, skipna=True):
6976 """
6977 Return index of first occurrence of minimum over requested axis.
6978 NA/null values are excluded.
6979
6980 Parameters
6981 ----------
6982 axis : {0 or 'index', 1 or 'columns'}, default 0
6983 0 or 'index' for row-wise, 1 or 'columns' for column-wise
6984 skipna : boolean, default True
6985 Exclude NA/null values. If an entire row/column is NA, the result
6986 will be NA.
6987
6988 Raises
6989 ------
6990 ValueError
6991 * If the row/column is empty
6992
6993 Returns
6994 -------
6995 idxmin : Series
6996
6997 Notes
6998 -----
6999 This method is the DataFrame version of ``ndarray.argmin``.
7000
7001 See Also
7002 --------
7003 Series.idxmin
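
        Examples
        --------
        A small constructed example:

        >>> df = pd.DataFrame({'A': [3, 1, 2], 'B': [2, 3, 1]})
        >>> df.idxmin()
        A    1
        B    2
        dtype: int64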
7004 """
7005 axis = self._get_axis_number(axis)
7006 indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
7007 index = self._get_axis(axis)
7008 result = [index[i] if i >= 0 else np.nan for i in indices]
7009 return Series(result, index=self._get_agg_axis(axis))
7010
7011 def idxmax(self, axis=0, skipna=True):
7012 """
7013 Return index of first occurrence of maximum over requested axis.
7014 NA/null values are excluded.
7015
7016 Parameters
7017 ----------
7018 axis : {0 or 'index', 1 or 'columns'}, default 0
7019 0 or 'index' for row-wise, 1 or 'columns' for column-wise
7020 skipna : boolean, default True
7021 Exclude NA/null values. If an entire row/column is NA, the result
7022 will be NA.
7023
7024 Raises
7025 ------
7026 ValueError
7027 * If the row/column is empty
7028
7029 Returns
7030 -------
7031 idxmax : Series
7032
7033 Notes
7034 -----
7035 This method is the DataFrame version of ``ndarray.argmax``.
7036
7037 See Also
7038 --------
7039 Series.idxmax
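
        Examples
        --------
        A small constructed example:

        >>> df = pd.DataFrame({'A': [3, 1, 2], 'B': [2, 3, 1]})
        >>> df.idxmax()
        A    0
        B    1
        dtype: int64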
7040 """
7041 axis = self._get_axis_number(axis)
7042 indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
7043 index = self._get_axis(axis)
7044 result = [index[i] if i >= 0 else np.nan for i in indices]
7045 return Series(result, index=self._get_agg_axis(axis))
7046
7047 def _get_agg_axis(self, axis_num):
7048 """ let's be explicit about this """
7049 if axis_num == 0:
7050 return self.columns
7051 elif axis_num == 1:
7052 return self.index
7053 else:
7054 raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
7055
7056 def mode(self, axis=0, numeric_only=False):
7057 """
7058 Gets the mode(s) of each element along the axis selected. Adds a row
7059 for each mode per label, fills in gaps with nan.
7060
7061 Note that there could be multiple values returned for the selected
7062 axis (when more than one item share the maximum frequency), which is
7063 the reason why a dataframe is returned. If you want to impute missing
7064 values with the mode in a dataframe ``df``, you can just do this:
7065 ``df.fillna(df.mode().iloc[0])``
7066
7067 Parameters
7068 ----------
7069 axis : {0 or 'index', 1 or 'columns'}, default 0
7070 * 0 or 'index' : get mode of each column
7071 * 1 or 'columns' : get mode of each row
7072 numeric_only : boolean, default False
7073 if True, only apply to numeric columns
7074
7075 Returns
7076 -------
7077 modes : DataFrame (sorted)
7078
7079 Examples
7080 --------
7081 >>> df = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3]})
7082 >>> df.mode()
7083 A
7084 0 1
7085 1 2
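
        Imputing missing values with the per-column mode, as mentioned
        above (a small constructed example):

        >>> df = pd.DataFrame({'A': [1.0, 1.0, None], 'B': [2.0, None, 2.0]})
        >>> df.fillna(df.mode().iloc[0])
             A    B
        0  1.0  2.0
        1  1.0  2.0
        2  1.0  2.0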
7086 """
7087 data = self if not numeric_only else self._get_numeric_data()
7088
7089 def f(s):
7090 return s.mode()
7091
7092 return data.apply(f, axis=axis)
7093
7094 def quantile(self, q=0.5, axis=0, numeric_only=True,
7095 interpolation='linear'):
7096 """
7097 Return values at the given quantile over requested axis, a la
7098 numpy.percentile.
7099
7100 Parameters
7101 ----------
7102 q : float or array-like, default 0.5 (50% quantile)
7103 0 <= q <= 1, the quantile(s) to compute
7104 axis : {0, 1, 'index', 'columns'} (default 0)
7105 0 or 'index' for row-wise, 1 or 'columns' for column-wise
7106 numeric_only : boolean, default True
7107 If False, the quantile of datetime and timedelta data will be
7108 computed as well
7109 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
7110 .. versionadded:: 0.18.0
7111
7112 This optional parameter specifies the interpolation method to use,
7113 when the desired quantile lies between two data points `i` and `j`:
7114
7115 * linear: `i + (j - i) * fraction`, where `fraction` is the
7116 fractional part of the index surrounded by `i` and `j`.
7117 * lower: `i`.
7118 * higher: `j`.
7119 * nearest: `i` or `j` whichever is nearest.
7120 * midpoint: (`i` + `j`) / 2.
7121
7122 Returns
7123 -------
7124 quantiles : Series or DataFrame
7125
7126 - If ``q`` is an array, a DataFrame will be returned where the
7127 index is ``q``, the columns are the columns of self, and the
7128 values are the quantiles.
7129 - If ``q`` is a float, a Series will be returned where the
7130 index is the columns of self and the values are the quantiles.
7131
7132 Examples
7133 --------
7134
        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
        ...                   columns=['a', 'b'])
7137 >>> df.quantile(.1)
7138 a 1.3
7139 b 3.7
7140 dtype: float64
7141 >>> df.quantile([.1, .5])
7142 a b
7143 0.1 1.3 3.7
7144 0.5 2.5 55.0
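
        The ``interpolation`` option controls how the result is computed
        when the quantile falls between two data points, e.g. taking the
        lower of the two:

        >>> df.quantile(.1, interpolation='lower')
        a    1
        b    1
        Name: 0.1, dtype: int64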
7145
7146 Specifying `numeric_only=False` will also compute the quantile of
7147 datetime and timedelta data.
7148
        >>> df = pd.DataFrame({'A': [1, 2],
        ...                    'B': [pd.Timestamp('2010'),
        ...                          pd.Timestamp('2011')],
        ...                    'C': [pd.Timedelta('1 days'),
        ...                          pd.Timedelta('2 days')]})
7154 >>> df.quantile(0.5, numeric_only=False)
7155 A 1.5
7156 B 2010-07-02 12:00:00
7157 C 1 days 12:00:00
7158 Name: 0.5, dtype: object
7159
7160 See Also
7161 --------
7162 pandas.core.window.Rolling.quantile
7163 """
7164 self._check_percentile(q)
7165
7166 data = self._get_numeric_data() if numeric_only else self
7167 axis = self._get_axis_number(axis)
7168 is_transposed = axis == 1
7169
7170 if is_transposed:
7171 data = data.T
7172
7173 result = data._data.quantile(qs=q,
7174 axis=1,
7175 interpolation=interpolation,
7176 transposed=is_transposed)
7177
7178 if result.ndim == 2:
7179 result = self._constructor(result)
7180 else:
7181 result = self._constructor_sliced(result, name=q)
7182
7183 if is_transposed:
7184 result = result.T
7185
7186 return result
7187
7188 def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
7189 """
7190 Cast to DatetimeIndex of timestamps, at *beginning* of period
7191
7192 Parameters
7193 ----------
7194 freq : string, default frequency of PeriodIndex
7195 Desired frequency
7196 how : {'s', 'e', 'start', 'end'}
7197 Convention for converting period to timestamp; start of period
7198 vs. end
7199 axis : {0 or 'index', 1 or 'columns'}, default 0
7200 The axis to convert (the index by default)
7201 copy : boolean, default True
7202 If false then underlying input data is not copied
7203
7204 Returns
7205 -------
7206 df : DataFrame with DatetimeIndex
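
        Examples
        --------
        A small constructed example with a monthly ``PeriodIndex``:

        >>> prng = pd.period_range('2018-01', periods=3, freq='M')
        >>> df = pd.DataFrame({'A': [1, 2, 3]}, index=prng)
        >>> df.to_timestamp()
                    A
        2018-01-01  1
        2018-02-01  2
        2018-03-01  3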
7207 """
7208 new_data = self._data
7209 if copy:
7210 new_data = new_data.copy()
7211
7212 axis = self._get_axis_number(axis)
7213 if axis == 0:
7214 new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
7215 elif axis == 1:
7216 new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
7217 else: # pragma: no cover
7218 raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
7219 ax=axis))
7220
7221 return self._constructor(new_data)
7222
    def to_period(self, freq=None, axis=0, copy=True):
        """
        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
        frequency (inferred from index if not passed).

        Parameters
        ----------
        freq : string, default None
            Desired frequency; inferred from the index if not passed.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : boolean, default True
            If False then underlying input data is not copied.

        Returns
        -------
        df : DataFrame with PeriodIndex
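
        Examples
        --------
        A minimal illustration (output assumes default display settings):

        >>> idx = pd.date_range('2017-01-31', periods=2, freq='M')
        >>> df = pd.DataFrame({'val': [1, 2]}, index=idx)
        >>> df.to_period('M')
                 val
        2017-01    1
        2017-02    2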
        """
        new_data = self._data
        if copy:
            new_data = new_data.copy()

        axis = self._get_axis_number(axis)
        if axis == 0:
            new_data.set_axis(1, self.index.to_period(freq=freq))
        elif axis == 1:
            new_data.set_axis(0, self.columns.to_period(freq=freq))
        else:  # pragma: no cover
            raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
                ax=axis))

        return self._constructor(new_data)

    def isin(self, values):
        """
        Return boolean DataFrame showing whether each element in the
        DataFrame is contained in values.

        Parameters
        ----------
        values : iterable, Series, DataFrame or dictionary
            The result will only be true at a location if all the
            labels match. If `values` is a Series, that's the index. If
            `values` is a dictionary, the keys must be the column names,
            which must match. If `values` is a DataFrame,
            then both the index and column labels must match.

        Returns
        -------
        DataFrame of booleans

        Examples
        --------
        When ``values`` is a list:

        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
        >>> df.isin([1, 3, 12, 'a'])
               A      B
        0   True   True
        1  False  False
        2   True  False

        When ``values`` is a dict:

        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
        >>> df.isin({'A': [1, 3], 'B': [4, 7, 12]})
               A      B
        0   True  False  # Note that B didn't match the 1 here.
        1  False   True
        2   True   True

        When ``values`` is a Series or DataFrame:

        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
        >>> other = pd.DataFrame({'A': [1, 3, 3, 2],
        ...                       'B': ['e', 'f', 'f', 'e']})
        >>> df.isin(other)
               A      B
        0   True  False
        1  False  False  # Column A in `other` has a 3, but not at index 1.
        2   True   True
        """
        if isinstance(values, dict):
            from pandas.core.reshape.concat import concat
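            # columns absent from `values` fall back to an empty list, so
            # every entry in such a column compares False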
            values = collections.defaultdict(list, values)
            return concat((self.iloc[:, [i]].isin(values[col])
                           for i, col in enumerate(self.columns)), axis=1)
        elif isinstance(values, Series):
            if not values.index.is_unique:
                raise ValueError("cannot compute isin with "
                                 "a duplicate axis.")
            return self.eq(values.reindex_like(self), axis='index')
        elif isinstance(values, DataFrame):
            if not (values.columns.is_unique and values.index.is_unique):
                raise ValueError("cannot compute isin with "
                                 "a duplicate axis.")
            return self.eq(values.reindex_like(self))
        else:
            if not is_list_like(values):
                raise TypeError("only list-like or dict-like objects are "
                                "allowed to be passed to DataFrame.isin(), "
                                "you passed a "
                                "{0!r}".format(type(values).__name__))
            return DataFrame(
                algorithms.isin(self.values.ravel(),
                                values).reshape(self.shape), self.index,
                self.columns)

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", gfx.FramePlotMethods)
    hist = gfx.hist_frame
    boxplot = gfx.boxplot_frame


DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
                      axes_are_reversed=True, aliases={'rows': 0},
                      docs={
                          'index': 'The index (row labels) of the DataFrame.',
                          'columns': 'The column labels of the DataFrame.'})
DataFrame._add_numeric_operations()
DataFrame._add_series_or_dataframe_operations()

ops.add_flex_arithmetic_methods(DataFrame)
ops.add_special_arithmetic_methods(DataFrame)

def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    # figure out the index, if necessary
    if index is None:
        index = extract_index(arrays)

    # don't force copy because getting jammed in an ndarray anyway
    arrays = _homogenize(arrays, index, dtype)

    # from BlockManager perspective
    axes = [_ensure_index(columns), _ensure_index(index)]

    return create_block_manager_from_arrays(arrays, arr_names, axes)

def extract_index(data):
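    # Derive a common index for a collection of columns: union the indexes
    # of any Series and the keys of any dicts, require raw array-likes to
    # share a single length, and reject purely scalar input, which needs an
    # explicit index.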
    from pandas.core.index import _union_indexes

    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for v in data:
            if isinstance(v, Series):
                have_series = True
                indexes.append(v.index)
            elif isinstance(v, dict):
                have_dicts = True
                indexes.append(list(v.keys()))
            elif is_list_like(v) and getattr(v, 'ndim', 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(v))

        if not indexes and not raw_lengths:
            raise ValueError('If using all scalar values, you must pass'
                             ' an index')

        if have_series or have_dicts:
            index = _union_indexes(indexes)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError('arrays must all be same length')

            if have_dicts:
                raise ValueError('Mixing dicts with non-Series may lead to '
                                 'ambiguous ordering.')

            if have_series:
                if lengths[0] != len(index):
                    msg = ('array length %d does not match index length %d' %
                           (lengths[0], len(index)))
                    raise ValueError(msg)
            else:
                index = com._default_index(lengths[0])

    return _ensure_index(index)

def _prep_ndarray(values, copy=True):
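    # Coerce arbitrary input to a 2-D ndarray: non-ndarray input is
    # converted element-wise (preserving platform dtypes), 1-D results are
    # reshaped to a single column, and anything that is not 2-D afterwards
    # is rejected.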
    if not isinstance(values, (np.ndarray, Series, Index)):
        if len(values) == 0:
            return np.empty((0, 0), dtype=object)

        def convert(v):
            return maybe_convert_platform(v)

        # we could have a 1-dim or 2-dim list here
        # this is equiv of np.asarray, but does object conversion
        # and platform dtype preservation
        try:
            if is_list_like(values[0]) or hasattr(values[0], '__len__'):
                values = np.array([convert(v) for v in values])
            else:
                values = convert(values)
        except Exception:
            values = convert(values)

    else:

        # drop subclass info, do not copy data
        values = np.asarray(values)
        if copy:
            values = values.copy()

    if values.ndim == 1:
        values = values.reshape((values.shape[0], 1))
    elif values.ndim != 2:
        raise ValueError('Must pass 2-d input')

    return values

def _to_arrays(data, columns, coerce_float=False, dtype=None):
    """
    Return list of arrays, columns.
    """
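    # Dispatch on the input type: DataFrames are split by column; empty
    # input short-circuits; otherwise the first element decides the
    # converter (list/tuple, Mapping, Series or Categorical), record
    # arrays are split by field, and anything else is coerced to tuples.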
    if isinstance(data, DataFrame):
        if columns is not None:
            arrays = [data._ixs(i, axis=1).values
                      for i, col in enumerate(data.columns) if col in columns]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray):
            columns = data.dtype.names
            if columns is not None:
                return [[]] * len(columns), columns
        return [], []  # columns if columns is not None else []
    if isinstance(data[0], (list, tuple)):
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)
    elif isinstance(data[0], collections.Mapping):
        return _list_of_dict_to_arrays(data, columns,
                                       coerce_float=coerce_float, dtype=dtype)
    elif isinstance(data[0], Series):
        return _list_of_series_to_arrays(data, columns,
                                         coerce_float=coerce_float,
                                         dtype=dtype)
    elif isinstance(data[0], Categorical):
        if columns is None:
            columns = com._default_index(len(data))
        return data, columns
    elif (isinstance(data, (np.ndarray, Series, Index)) and
          data.dtype.names is not None):

        columns = list(data.dtype.names)
        arrays = [data[k] for k in columns]
        return arrays, columns
    else:
        # last ditch effort
        data = lmap(tuple, data)
        return _list_to_arrays(data, columns, coerce_float=coerce_float,
                               dtype=dtype)

def _masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """ extract from a masked rec array and create the manager """

    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = _get_names_from_index(fdata)
        if index is None:
            index = com._default_index(len(data))
    index = _ensure_index(index)

    if columns is not None:
        columns = _ensure_index(columns)
    arrays, arr_columns = _to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = _reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = _arrays_to_mgr(arrays, arr_columns, index, columns)

    if copy:
        mgr = mgr.copy()
    return mgr

def _reorder_arrays(arrays, arr_columns, columns):
    # reorder according to the columns
    if (columns is not None and len(columns) and arr_columns is not None and
            len(arr_columns)):
        indexer = _ensure_index(arr_columns).get_indexer(columns)
        arr_columns = _ensure_index([arr_columns[i] for i in indexer])
        arrays = [arrays[i] for i in indexer]
    return arrays, arr_columns

def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
    if len(data) > 0 and isinstance(data[0], tuple):
        content = list(lib.to_object_array_tuples(data).T)
    else:
        # list of lists
        content = list(lib.to_object_array(data).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
    from pandas.core.index import _get_objs_combined_axis

    if columns is None:
        columns = _get_objs_combined_axis(data, sort=False)

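    # cache indexers by id(index) so Series sharing the same index object
    # only pay for get_indexer once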
    indexer_cache = {}

    aligned_values = []
    for s in data:
        index = getattr(s, 'index', None)
        if index is None:
            index = com._default_index(len(s))

        if id(index) in indexer_cache:
            indexer = indexer_cache[id(index)]
        else:
            indexer = indexer_cache[id(index)] = index.get_indexer(columns)

        values = com._values_from_object(s)
        aligned_values.append(algorithms.take_1d(values, indexer))

    values = np.vstack(aligned_values)

    if values.dtype == np.object_:
        content = list(values.T)
        return _convert_object_array(content, columns, dtype=dtype,
                                     coerce_float=coerce_float)
    else:
        return values.T, columns

def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
    if columns is None:
        gen = (list(x.keys()) for x in data)
        sort = not any(isinstance(d, OrderedDict) for d in data)
        columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)

    # assure that they are of the base dict class and not of derived
    # classes
    data = [d if type(d) is dict else dict(d) for d in data]

    content = list(lib.dicts_to_array(data, list(columns)).T)
    return _convert_object_array(content, columns, dtype=dtype,
                                 coerce_float=coerce_float)

def _convert_object_array(content, columns, coerce_float=False, dtype=None):
    if columns is None:
        columns = com._default_index(len(content))
    else:
        if len(columns) != len(content):  # pragma: no cover
            # caller's responsibility to check for this...
            raise AssertionError('{col:d} columns passed, passed data had '
                                 '{con} columns'.format(col=len(columns),
                                                        con=len(content)))

    # provide soft conversion of object dtypes
    def convert(arr):
        if dtype != object and dtype != np.object:
            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
            arr = maybe_cast_to_datetime(arr, dtype)
        return arr

    arrays = [convert(arr) for arr in content]

    return arrays, columns

def _get_names_from_index(data):
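    # Build an index from Series names: named entries keep their name,
    # unnamed ones become 'Unnamed %d'; if nothing is named, fall back to
    # the default integer index.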
    has_some_name = any(getattr(s, 'name', None) is not None for s in data)
    if not has_some_name:
        return com._default_index(len(data))

    index = lrange(len(data))
    count = 0
    for i, s in enumerate(data):
        n = getattr(s, 'name', None)
        if n is not None:
            index[i] = n
        else:
            index[i] = 'Unnamed %d' % count
            count += 1

    return index

def _homogenize(data, index, dtype=None):
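    # Align each column to `index`: Series are reindexed (without copying),
    # dicts are looked up against the index values, and everything else is
    # run through _sanitize_array so each column comes back index-aligned.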
    from pandas.core.series import _sanitize_array

    oindex = None
    homogenized = []

    for v in data:
        if isinstance(v, Series):
            if dtype is not None:
                v = v.astype(dtype)
            if v.index is not index:
                # Forces alignment. No need to copy data since we
                # are putting it into an ndarray later
                v = v.reindex(index, copy=False)
        else:
            if isinstance(v, dict):
                if oindex is None:
                    oindex = index.astype('O')

                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
                    v = com._dict_compat(v)
                else:
                    v = dict(v)
                v = lib.fast_multiget(v, oindex.values, default=np.nan)
            v = _sanitize_array(v, index, dtype=dtype, copy=False,
                                raise_cast_failure=False)

        homogenized.append(v)

    return homogenized

def _from_nested_dict(data):
    # TODO: this should be seriously cythonized
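    # transpose {index -> {column -> value}} into {column -> {index -> value}},
    # e.g. {'r1': {'c1': 1}, 'r2': {'c1': 2}} -> {'c1': {'r1': 1, 'r2': 2}}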
    new_data = OrderedDict()
    for index, s in compat.iteritems(data):
        for col, v in compat.iteritems(s):
            new_data[col] = new_data.get(col, OrderedDict())
            new_data[col][index] = v
    return new_data

def _put_str(s, space):
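    # stringify, truncate to `space` characters, then pad to that width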
    return u'{s}'.format(s=s)[:space].ljust(space)