mnmFWwQC

· 6 years ago · Oct 15, 2019, 04:12 PM
1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
6
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
9labeling information
10"""
11from __future__ import division
12# pylint: disable=E1101,E1103
13# pylint: disable=W0212,W0231,W0703,W0622
14
15import functools
16import collections
17import itertools
18import sys
19import types
20import warnings
21from textwrap import dedent
22
23import numpy as np
24import numpy.ma as ma
25
26from pandas.core.accessor import CachedAccessor
27from pandas.core.dtypes.cast import (
28    maybe_upcast,
29    cast_scalar_to_array,
30    construct_1d_arraylike_from_scalar,
31    maybe_cast_to_datetime,
32    maybe_infer_to_datetimelike,
33    maybe_convert_platform,
34    maybe_downcast_to_dtype,
35    invalidate_string_dtypes,
36    coerce_to_dtypes,
37    maybe_upcast_putmask,
38    find_common_type)
39from pandas.core.dtypes.common import (
40    is_categorical_dtype,
41    is_object_dtype,
42    is_extension_type,
43    is_extension_array_dtype,
44    is_datetimetz,
45    is_datetime64_any_dtype,
46    is_bool_dtype,
47    is_integer_dtype,
48    is_float_dtype,
49    is_integer,
50    is_scalar,
51    is_dtype_equal,
52    needs_i8_conversion,
53    _get_dtype_from_object,
54    _ensure_float64,
55    _ensure_int64,
56    _ensure_platform_int,
57    is_list_like,
58    is_nested_list_like,
59    is_iterator,
60    is_sequence,
61    is_named_tuple)
62from pandas.core.dtypes.concat import _get_sliced_frame_result_type
63from pandas.core.dtypes.missing import isna, notna
64
65
66from pandas.core.generic import NDFrame, _shared_docs
67from pandas.core.index import (Index, MultiIndex, _ensure_index,
68                               _ensure_index_from_sequences)
69from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable,
70                                  check_bool_indexer)
71from pandas.core.internals import (BlockManager,
72                                   create_block_manager_from_arrays,
73                                   create_block_manager_from_blocks)
74from pandas.core.series import Series
75from pandas.core.arrays import Categorical, ExtensionArray
76import pandas.core.algorithms as algorithms
77from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
78                           OrderedDict, raise_with_traceback)
79from pandas import compat
80from pandas.compat import PY36
81from pandas.compat.numpy import function as nv
82from pandas.util._decorators import (Appender, Substitution,
83                                     rewrite_axis_style_signature)
84from pandas.util._validators import (validate_bool_kwarg,
85                                     validate_axis_style_args)
86
87from pandas.core.indexes.period import PeriodIndex
88from pandas.core.indexes.datetimes import DatetimeIndex
89from pandas.core.indexes.timedeltas import TimedeltaIndex
90
91import pandas.core.common as com
92import pandas.core.nanops as nanops
93import pandas.core.ops as ops
94import pandas.io.formats.console as console
95import pandas.io.formats.format as fmt
96from pandas.io.formats.printing import pprint_thing
97import pandas.plotting._core as gfx
98
99from pandas._libs import lib, algos as libalgos
100
101from pandas.core.config import get_option
102
103# ---------------------------------------------------------------------
104# Docstring templates
105
106_shared_doc_kwargs = dict(
107    axes='index, columns', klass='DataFrame',
108    axes_single_arg="{0 or 'index', 1 or 'columns'}",
109    axis="""
110    axis : {0 or 'index', 1 or 'columns'}, default 0
111        - 0 or 'index': apply function to each column.
112        - 1 or 'columns': apply function to each row.""",
113    optional_by="""
114        by : str or list of str
115            Name or list of names to sort by.
116
117            - if `axis` is 0 or `'index'` then `by` may contain index
118              levels and/or column labels
119            - if `axis` is 1 or `'columns'` then `by` may contain column
120              levels and/or index labels
121
122            .. versionchanged:: 0.23.0
123               Allow specifying index or column level names.""",
124    versionadded_to_excel='',
125    optional_labels="""labels : array-like, optional
126            New labels / index to conform the axis specified by 'axis' to.""",
127    optional_axis="""axis : int or str, optional
128            Axis to target. Can be either the axis name ('index', 'columns')
129            or number (0, 1).""",
130)
131
132_numeric_only_doc = """numeric_only : boolean, default None
133    Include only float, int, boolean data. If None, will attempt to use
134    everything, then use only numeric data
135"""
136
137_merge_doc = """
138Merge DataFrame objects by performing a database-style join operation by
139columns or indexes.
140
141If joining columns on columns, the DataFrame indexes *will be
142ignored*. Otherwise if joining indexes on indexes or indexes on a column or
143columns, the index will be passed on.
144
145Parameters
146----------%s
147right : DataFrame
148how : {'left', 'right', 'outer', 'inner'}, default 'inner'
149    * left: use only keys from left frame, similar to a SQL left outer join;
150      preserve key order
151    * right: use only keys from right frame, similar to a SQL right outer join;
152      preserve key order
153    * outer: use union of keys from both frames, similar to a SQL full outer
154      join; sort keys lexicographically
155    * inner: use intersection of keys from both frames, similar to a SQL inner
156      join; preserve the order of the left keys
157on : label or list
158    Column or index level names to join on. These must be found in both
159    DataFrames. If `on` is None and not merging on indexes then this defaults
160    to the intersection of the columns in both DataFrames.
161left_on : label or list, or array-like
162    Column or index level names to join on in the left DataFrame. Can also
163    be an array or list of arrays of the length of the left DataFrame.
164    These arrays are treated as if they are columns.
165right_on : label or list, or array-like
166    Column or index level names to join on in the right DataFrame. Can also
167    be an array or list of arrays of the length of the right DataFrame.
168    These arrays are treated as if they are columns.
169left_index : boolean, default False
170    Use the index from the left DataFrame as the join key(s). If it is a
171    MultiIndex, the number of keys in the other DataFrame (either the index
172    or a number of columns) must match the number of levels
173right_index : boolean, default False
174    Use the index from the right DataFrame as the join key. Same caveats as
175    left_index
176sort : boolean, default False
177    Sort the join keys lexicographically in the result DataFrame. If False,
178    the order of the join keys depends on the join type (how keyword)
179suffixes : 2-length sequence (tuple, list, ...)
180    Suffix to apply to overlapping column names in the left and right
181    side, respectively
182copy : boolean, default True
183    If False, do not copy data unnecessarily
184indicator : boolean or string, default False
185    If True, adds a column to output DataFrame called "_merge" with
186    information on the source of each row.
187    If string, column with information on source of each row will be added to
188    output DataFrame, and column will be named value of string.
189    Information column is Categorical-type and takes on a value of "left_only"
190    for observations whose merge key only appears in 'left' DataFrame,
191    "right_only" for observations whose merge key only appears in 'right'
192    DataFrame, and "both" if the observation's merge key is found in both.
193
194validate : string, default None
195    If specified, checks if merge is of specified type.
196
197    * "one_to_one" or "1:1": check if merge keys are unique in both
198      left and right datasets.
199    * "one_to_many" or "1:m": check if merge keys are unique in left
200      dataset.
201    * "many_to_one" or "m:1": check if merge keys are unique in right
202      dataset.
203    * "many_to_many" or "m:m": allowed, but does not result in checks.
204
205    .. versionadded:: 0.21.0
206
207Notes
208-----
209Support for specifying index levels as the `on`, `left_on`, and
210`right_on` parameters was added in version 0.23.0
211
212Examples
213--------
214
215>>> A              >>> B
216    lkey value         rkey value
2170   foo  1         0   foo  5
2181   bar  2         1   bar  6
2192   baz  3         2   qux  7
2203   foo  4         3   bar  8
221
222>>> A.merge(B, left_on='lkey', right_on='rkey', how='outer')
223   lkey  value_x  rkey  value_y
2240  foo   1        foo   5
2251  foo   4        foo   5
2262  bar   2        bar   6
2273  bar   2        bar   8
2284  baz   3        NaN   NaN
2295  NaN   NaN      qux   7
230
231Returns
232-------
233merged : DataFrame
234    The output type will the be same as 'left', if it is a subclass
235    of DataFrame.
236
237See also
238--------
239merge_ordered
240merge_asof
241DataFrame.join
242"""
243
244# -----------------------------------------------------------------------
245# DataFrame class
246
247
248class DataFrame(NDFrame):
249    """ Two-dimensional size-mutable, potentially heterogeneous tabular data
250    structure with labeled axes (rows and columns). Arithmetic operations
251    align on both row and column labels. Can be thought of as a dict-like
252    container for Series objects. The primary pandas data structure.
253
254    Parameters
255    ----------
256    data : numpy ndarray (structured or homogeneous), dict, or DataFrame
257        Dict can contain Series, arrays, constants, or list-like objects
258
259        .. versionchanged :: 0.23.0
260           If data is a dict, argument order is maintained for Python 3.6
261           and later.
262
263    index : Index or array-like
264        Index to use for resulting frame. Will default to RangeIndex if
265        no indexing information part of input data and no index provided
266    columns : Index or array-like
267        Column labels to use for resulting frame. Will default to
268        RangeIndex (0, 1, 2, ..., n) if no column labels are provided
269    dtype : dtype, default None
270        Data type to force. Only a single dtype is allowed. If None, infer
271    copy : boolean, default False
272        Copy data from inputs. Only affects DataFrame / 2d ndarray input
273
274    Examples
275    --------
276    Constructing DataFrame from a dictionary.
277
278    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
279    >>> df = pd.DataFrame(data=d)
280    >>> df
281       col1  col2
282    0     1     3
283    1     2     4
284
285    Notice that the inferred dtype is int64.
286
287    >>> df.dtypes
288    col1    int64
289    col2    int64
290    dtype: object
291
292    To enforce a single dtype:
293
294    >>> df = pd.DataFrame(data=d, dtype=np.int8)
295    >>> df.dtypes
296    col1    int8
297    col2    int8
298    dtype: object
299
300    Constructing DataFrame from numpy ndarray:
301
302    >>> df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)),
303    ...                    columns=['a', 'b', 'c', 'd', 'e'])
304    >>> df2
305        a   b   c   d   e
306    0   2   8   8   3   4
307    1   4   2   9   0   9
308    2   1   0   7   8   0
309    3   5   1   7   1   3
310    4   6   0   2   4   2
311
312    See also
313    --------
314    DataFrame.from_records : constructor from tuples, also record arrays
315    DataFrame.from_dict : from dicts of Series, arrays, or dicts
316    DataFrame.from_items : from sequence of (key, value) pairs
317    pandas.read_csv, pandas.read_table, pandas.read_clipboard
318    """
319
320    @property
321    def _constructor(self):
322        return DataFrame
323
324    _constructor_sliced = Series
325    _deprecations = NDFrame._deprecations | frozenset(
326        ['sortlevel', 'get_value', 'set_value', 'from_csv', 'from_items'])
327    _accessors = set()
328
329    @property
330    def _constructor_expanddim(self):
331        from pandas.core.panel import Panel
332        return Panel
333
334    def __init__(self, data=None, index=None, columns=None, dtype=None,
335                 copy=False):
336        if data is None:
337            data = {}
338        if dtype is not None:
339            dtype = self._validate_dtype(dtype)
340
341        if isinstance(data, DataFrame):
342            data = data._data
343
344        if isinstance(data, BlockManager):
345            mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
346                                 dtype=dtype, copy=copy)
347        elif isinstance(data, dict):
348            mgr = self._init_dict(data, index, columns, dtype=dtype)
349        elif isinstance(data, ma.MaskedArray):
350            import numpy.ma.mrecords as mrecords
351            # masked recarray
352            if isinstance(data, mrecords.MaskedRecords):
353                mgr = _masked_rec_array_to_mgr(data, index, columns, dtype,
354                                               copy)
355
356            # a masked array
357            else:
358                mask = ma.getmaskarray(data)
359                if mask.any():
360                    data, fill_value = maybe_upcast(data, copy=True)
361                    data[mask] = fill_value
362                else:
363                    data = data.copy()
364                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
365                                         copy=copy)
366
367        elif isinstance(data, (np.ndarray, Series, Index)):
368            if data.dtype.names:
369                data_columns = list(data.dtype.names)
370                data = {k: data[k] for k in data_columns}
371                if columns is None:
372                    columns = data_columns
373                mgr = self._init_dict(data, index, columns, dtype=dtype)
374            elif getattr(data, 'name', None) is not None:
375                mgr = self._init_dict({data.name: data}, index, columns,
376                                      dtype=dtype)
377            else:
378                mgr = self._init_ndarray(data, index, columns, dtype=dtype,
379                                         copy=copy)
380        elif isinstance(data, (list, types.GeneratorType)):
381            if isinstance(data, types.GeneratorType):
382                data = list(data)
383            if len(data) > 0:
384                if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1:
385                    if is_named_tuple(data[0]) and columns is None:
386                        columns = data[0]._fields
387                    arrays, columns = _to_arrays(data, columns, dtype=dtype)
388                    columns = _ensure_index(columns)
389
390                    # set the index
391                    if index is None:
392                        if isinstance(data[0], Series):
393                            index = _get_names_from_index(data)
394                        elif isinstance(data[0], Categorical):
395                            index = com._default_index(len(data[0]))
396                        else:
397                            index = com._default_index(len(data))
398
399                    mgr = _arrays_to_mgr(arrays, columns, index, columns,
400                                         dtype=dtype)
401                else:
402                    mgr = self._init_ndarray(data, index, columns, dtype=dtype,
403                                             copy=copy)
404            else:
405                mgr = self._init_dict({}, index, columns, dtype=dtype)
406        elif isinstance(data, collections.Iterator):
407            raise TypeError("data argument can't be an iterator")
408        else:
409            try:
410                arr = np.array(data, dtype=dtype, copy=copy)
411            except (ValueError, TypeError) as e:
412                exc = TypeError('DataFrame constructor called with '
413                                'incompatible data and dtype: {e}'.format(e=e))
414                raise_with_traceback(exc)
415
416            if arr.ndim == 0 and index is not None and columns is not None:
417                values = cast_scalar_to_array((len(index), len(columns)),
418                                              data, dtype=dtype)
419                mgr = self._init_ndarray(values, index, columns,
420                                         dtype=values.dtype, copy=False)
421            else:
422                raise ValueError('DataFrame constructor not properly called!')
423
424        NDFrame.__init__(self, mgr, fastpath=True)
425
426    def _init_dict(self, data, index, columns, dtype=None):
427        """
428        Segregate Series based on type and coerce into matrices.
429        Needs to handle a lot of exceptional cases.
430        """
431        if columns is not None:
432            arrays = Series(data, index=columns, dtype=object)
433            data_names = arrays.index
434
435            missing = arrays.isnull()
436            if index is None:
437                # GH10856
438                # raise ValueError if only scalars in dict
439                index = extract_index(arrays[~missing])
440            else:
441                index = _ensure_index(index)
442
443            # no obvious "empty" int column
444            if missing.any() and not is_integer_dtype(dtype):
445                if dtype is None or np.issubdtype(dtype, np.flexible):
446                    # 1783
447                    nan_dtype = object
448                else:
449                    nan_dtype = dtype
450                v = construct_1d_arraylike_from_scalar(np.nan, len(index),
451                                                       nan_dtype)
452                arrays.loc[missing] = [v] * missing.sum()
453
454        else:
455            keys = com._dict_keys_to_ordered_list(data)
456            columns = data_names = Index(keys)
457            arrays = [data[k] for k in keys]
458
459        return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
460
461    def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
462        # input must be a ndarray, list, Series, index
463
464        if isinstance(values, Series):
465            if columns is None:
466                if values.name is not None:
467                    columns = [values.name]
468            if index is None:
469                index = values.index
470            else:
471                values = values.reindex(index)
472
473            # zero len case (GH #2234)
474            if not len(values) and columns is not None and len(columns):
475                values = np.empty((0, 1), dtype=object)
476
477        # helper to create the axes as indexes
478        def _get_axes(N, K, index=index, columns=columns):
479            # return axes or defaults
480
481            if index is None:
482                index = com._default_index(N)
483            else:
484                index = _ensure_index(index)
485
486            if columns is None:
487                columns = com._default_index(K)
488            else:
489                columns = _ensure_index(columns)
490            return index, columns
491
492        # we could have a categorical type passed or coerced to 'category'
493        # recast this to an _arrays_to_mgr
494        if (is_categorical_dtype(getattr(values, 'dtype', None)) or
495                is_categorical_dtype(dtype)):
496
497            if not hasattr(values, 'dtype'):
498                values = _prep_ndarray(values, copy=copy)
499                values = values.ravel()
500            elif copy:
501                values = values.copy()
502
503            index, columns = _get_axes(len(values), 1)
504            return _arrays_to_mgr([values], columns, index, columns,
505                                  dtype=dtype)
506        elif (is_datetimetz(values) or is_extension_array_dtype(values)):
507            # GH19157
508            if columns is None:
509                columns = [0]
510            return _arrays_to_mgr([values], columns, index, columns,
511                                  dtype=dtype)
512
513        # by definition an array here
514        # the dtypes will be coerced to a single dtype
515        values = _prep_ndarray(values, copy=copy)
516
517        if dtype is not None:
518            if not is_dtype_equal(values.dtype, dtype):
519                try:
520                    values = values.astype(dtype)
521                except Exception as orig:
522                    e = ValueError("failed to cast to '{dtype}' (Exception "
523                                   "was: {orig})".format(dtype=dtype,
524                                                         orig=orig))
525                    raise_with_traceback(e)
526
527        index, columns = _get_axes(*values.shape)
528        values = values.T
529
530        # if we don't have a dtype specified, then try to convert objects
531        # on the entire block; this is to convert if we have datetimelike's
532        # embedded in an object type
533        if dtype is None and is_object_dtype(values):
534            values = maybe_infer_to_datetimelike(values)
535
536        return create_block_manager_from_blocks([values], [columns, index])
537
538    @property
539    def axes(self):
540        """
541        Return a list representing the axes of the DataFrame.
542
543        It has the row axis labels and column axis labels as the only members.
544        They are returned in that order.
545
546        Examples
547        --------
548        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
549        >>> df.axes
550        [RangeIndex(start=0, stop=2, step=1), Index(['coll', 'col2'],
551        dtype='object')]
552        """
553        return [self.index, self.columns]
554
555    @property
556    def shape(self):
557        """
558        Return a tuple representing the dimensionality of the DataFrame.
559
560        See Also
561        --------
562        ndarray.shape
563
564        Examples
565        --------
566        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
567        >>> df.shape
568        (2, 2)
569
570        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
571        ...                    'col3': [5, 6]})
572        >>> df.shape
573        (2, 3)
574        """
575        return len(self.index), len(self.columns)
576
577    def _repr_fits_vertical_(self):
578        """
579        Check length against max_rows.
580        """
581        max_rows = get_option("display.max_rows")
582        return len(self) <= max_rows
583
584    def _repr_fits_horizontal_(self, ignore_width=False):
585        """
586        Check if full repr fits in horizontal boundaries imposed by the display
587        options width and max_columns. In case off non-interactive session, no
588        boundaries apply.
589
590        ignore_width is here so ipnb+HTML output can behave the way
591        users expect. display.max_columns remains in effect.
592        GH3541, GH3573
593        """
594
595        width, height = console.get_console_size()
596        max_columns = get_option("display.max_columns")
597        nb_columns = len(self.columns)
598
599        # exceed max columns
600        if ((max_columns and nb_columns > max_columns) or
601                ((not ignore_width) and width and nb_columns > (width // 2))):
602            return False
603
604        # used by repr_html under IPython notebook or scripts ignore terminal
605        # dims
606        if ignore_width or not com.in_interactive_session():
607            return True
608
609        if (get_option('display.width') is not None or
610                com.in_ipython_frontend()):
611            # check at least the column row for excessive width
612            max_rows = 1
613        else:
614            max_rows = get_option("display.max_rows")
615
616        # when auto-detecting, so width=None and not in ipython front end
617        # check whether repr fits horizontal by actually checking
618        # the width of the rendered repr
619        buf = StringIO()
620
621        # only care about the stuff we'll actually print out
622        # and to_string on entire frame may be expensive
623        d = self
624
625        if not (max_rows is None):  # unlimited rows
626            # min of two, where one may be None
627            d = d.iloc[:min(max_rows, len(d))]
628        else:
629            return True
630
631        d.to_string(buf=buf)
632        value = buf.getvalue()
633        repr_width = max(len(l) for l in value.split('\n'))
634
635        return repr_width < width
636
637    def _info_repr(self):
638        """True if the repr should show the info view."""
639        info_repr_option = (get_option("display.large_repr") == "info")
640        return info_repr_option and not (self._repr_fits_horizontal_() and
641                                         self._repr_fits_vertical_())
642
643    def __unicode__(self):
644        """
645        Return a string representation for a particular DataFrame
646
647        Invoked by unicode(df) in py2 only. Yields a Unicode String in both
648        py2/py3.
649        """
650        buf = StringIO(u(""))
651        if self._info_repr():
652            self.info(buf=buf)
653            return buf.getvalue()
654
655        max_rows = get_option("display.max_rows")
656        max_cols = get_option("display.max_columns")
657        show_dimensions = get_option("display.show_dimensions")
658        if get_option("display.expand_frame_repr"):
659            width, _ = console.get_console_size()
660        else:
661            width = None
662        self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
663                       line_width=width, show_dimensions=show_dimensions)
664
665        return buf.getvalue()
666
667    def _repr_html_(self):
668        """
669        Return a html representation for a particular DataFrame.
670        Mainly for IPython notebook.
671        """
672        # qtconsole doesn't report its line width, and also
673        # behaves badly when outputting an HTML table
674        # that doesn't fit the window, so disable it.
675        # XXX: In IPython 3.x and above, the Qt console will not attempt to
676        # display HTML, so this check can be removed when support for
677        # IPython 2.x is no longer needed.
678        if com.in_qtconsole():
679            # 'HTML output is disabled in QtConsole'
680            return None
681
682        if self._info_repr():
683            buf = StringIO(u(""))
684            self.info(buf=buf)
685            # need to escape the <class>, should be the first line.
686            val = buf.getvalue().replace('<', r'&lt;', 1)
687            val = val.replace('>', r'&gt;', 1)
688            return '<pre>' + val + '</pre>'
689
690        if get_option("display.notebook_repr_html"):
691            max_rows = get_option("display.max_rows")
692            max_cols = get_option("display.max_columns")
693            show_dimensions = get_option("display.show_dimensions")
694
695            return self.to_html(max_rows=max_rows, max_cols=max_cols,
696                                show_dimensions=show_dimensions, notebook=True)
697        else:
698            return None
699
700    @property
701    def style(self):
702        """
703        Property returning a Styler object containing methods for
704        building a styled HTML representation fo the DataFrame.
705
706        See Also
707        --------
708        pandas.io.formats.style.Styler
709        """
710        from pandas.io.formats.style import Styler
711        return Styler(self)
712
713    def iteritems(self):
714        """
715        Iterator over (column name, Series) pairs.
716
717        See also
718        --------
719        iterrows : Iterate over DataFrame rows as (index, Series) pairs.
720        itertuples : Iterate over DataFrame rows as namedtuples of the values.
721
722        """
723        if self.columns.is_unique and hasattr(self, '_item_cache'):
724            for k in self.columns:
725                yield k, self._get_item_cache(k)
726        else:
727            for i, k in enumerate(self.columns):
728                yield k, self._ixs(i, axis=1)
729
730    def iterrows(self):
731        """
732        Iterate over DataFrame rows as (index, Series) pairs.
733
734        Notes
735        -----
736
737        1. Because ``iterrows`` returns a Series for each row,
738           it does **not** preserve dtypes across the rows (dtypes are
739           preserved across columns for DataFrames). For example,
740
741           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
742           >>> row = next(df.iterrows())[1]
743           >>> row
744           int      1.0
745           float    1.5
746           Name: 0, dtype: float64
747           >>> print(row['int'].dtype)
748           float64
749           >>> print(df['int'].dtype)
750           int64
751
752           To preserve dtypes while iterating over the rows, it is better
753           to use :meth:`itertuples` which returns namedtuples of the values
754           and which is generally faster than ``iterrows``.
755
756        2. You should **never modify** something you are iterating over.
757           This is not guaranteed to work in all cases. Depending on the
758           data types, the iterator returns a copy and not a view, and writing
759           to it will have no effect.
760
761        Returns
762        -------
763        it : generator
764            A generator that iterates over the rows of the frame.
765
766        See also
767        --------
768        itertuples : Iterate over DataFrame rows as namedtuples of the values.
769        iteritems : Iterate over (column name, Series) pairs.
770
771        """
772        columns = self.columns
773        klass = self._constructor_sliced
774        for k, v in zip(self.index, self.values):
775            s = klass(v, index=columns, name=k)
776            yield k, s
777
778    def itertuples(self, index=True, name="Pandas"):
779        """
780        Iterate over DataFrame rows as namedtuples, with index value as first
781        element of the tuple.
782
783        Parameters
784        ----------
785        index : boolean, default True
786            If True, return the index as the first element of the tuple.
787        name : string, default "Pandas"
788            The name of the returned namedtuples or None to return regular
789            tuples.
790
791        Notes
792        -----
793        The column names will be renamed to positional names if they are
794        invalid Python identifiers, repeated, or start with an underscore.
795        With a large number of columns (>255), regular tuples are returned.
796
797        See also
798        --------
799        iterrows : Iterate over DataFrame rows as (index, Series) pairs.
800        iteritems : Iterate over (column name, Series) pairs.
801
802        Examples
803        --------
804
805        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]},
806                              index=['a', 'b'])
807        >>> df
808           col1  col2
809        a     1   0.1
810        b     2   0.2
811        >>> for row in df.itertuples():
812        ...     print(row)
813        ...
814        Pandas(Index='a', col1=1, col2=0.10000000000000001)
815        Pandas(Index='b', col1=2, col2=0.20000000000000001)
816
817        """
818        arrays = []
819        fields = []
820        if index:
821            arrays.append(self.index)
822            fields.append("Index")
823
824        # use integer indexing because of possible duplicate column names
825        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
826
827        # Python 3 supports at most 255 arguments to constructor, and
828        # things get slow with this many fields in Python 2
829        if name is not None and len(self.columns) + index < 256:
830            # `rename` is unsupported in Python 2.6
831            try:
832                itertuple = collections.namedtuple(name,
833                                                   fields + list(self.columns),
834                                                   rename=True)
835                return map(itertuple._make, zip(*arrays))
836            except Exception:
837                pass
838
839        # fallback to regular tuples
840        return zip(*arrays)
841
842    items = iteritems
843
844    def __len__(self):
845        """Returns length of info axis, but here we use the index """
846        return len(self.index)
847
848    def dot(self, other):
849        """
850        Matrix multiplication with DataFrame or Series objects.  Can also be
851        called using `self @ other` in Python >= 3.5.
852
853        Parameters
854        ----------
855        other : DataFrame or Series
856
857        Returns
858        -------
859        dot_product : DataFrame or Series
860        """
861        if isinstance(other, (Series, DataFrame)):
862            common = self.columns.union(other.index)
863            if (len(common) > len(self.columns) or
864                    len(common) > len(other.index)):
865                raise ValueError('matrices are not aligned')
866
867            left = self.reindex(columns=common, copy=False)
868            right = other.reindex(index=common, copy=False)
869            lvals = left.values
870            rvals = right.values
871        else:
872            left = self
873            lvals = self.values
874            rvals = np.asarray(other)
875            if lvals.shape[1] != rvals.shape[0]:
876                raise ValueError('Dot product shape mismatch, '
877                                 '{l} vs {r}'.format(l=lvals.shape,
878                                                     r=rvals.shape))
879
880        if isinstance(other, DataFrame):
881            return self._constructor(np.dot(lvals, rvals), index=left.index,
882                                     columns=other.columns)
883        elif isinstance(other, Series):
884            return Series(np.dot(lvals, rvals), index=left.index)
885        elif isinstance(rvals, (np.ndarray, Index)):
886            result = np.dot(lvals, rvals)
887            if result.ndim == 2:
888                return self._constructor(result, index=left.index)
889            else:
890                return Series(result, index=left.index)
891        else:  # pragma: no cover
892            raise TypeError('unsupported type: {oth}'.format(oth=type(other)))
893
894    def __matmul__(self, other):
895        """ Matrix multiplication using binary `@` operator in Python>=3.5 """
896        return self.dot(other)
897
898    def __rmatmul__(self, other):
899        """ Matrix multiplication using binary `@` operator in Python>=3.5 """
900        return self.T.dot(np.transpose(other)).T
901
902    # ----------------------------------------------------------------------
903    # IO methods (to / from other formats)
904
905    @classmethod
906    def from_dict(cls, data, orient='columns', dtype=None, columns=None):
907        """
908        Construct DataFrame from dict of array-like or dicts.
909
910        Creates DataFrame object from dictionary by columns or by index
911        allowing dtype specification.
912
913        Parameters
914        ----------
915        data : dict
916            Of the form {field : array-like} or {field : dict}.
917        orient : {'columns', 'index'}, default 'columns'
918            The "orientation" of the data. If the keys of the passed dict
919            should be the columns of the resulting DataFrame, pass 'columns'
920            (default). Otherwise if the keys should be rows, pass 'index'.
921        dtype : dtype, default None
922            Data type to force, otherwise infer.
923        columns : list, default None
924            Column labels to use when ``orient='index'``. Raises a ValueError
925            if used with ``orient='columns'``.
926
927            .. versionadded:: 0.23.0
928
929        Returns
930        -------
931        pandas.DataFrame
932
933        See Also
934        --------
935        DataFrame.from_records : DataFrame from ndarray (structured
936            dtype), list of tuples, dict, or DataFrame
937        DataFrame : DataFrame object creation using constructor
938
939        Examples
940        --------
941        By default the keys of the dict become the DataFrame columns:
942
943        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
944        >>> pd.DataFrame.from_dict(data)
945           col_1 col_2
946        0      3     a
947        1      2     b
948        2      1     c
949        3      0     d
950
951        Specify ``orient='index'`` to create the DataFrame using dictionary
952        keys as rows:
953
954        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
955        >>> pd.DataFrame.from_dict(data, orient='index')
956               0  1  2  3
957        row_1  3  2  1  0
958        row_2  a  b  c  d
959
960        When using the 'index' orientation, the column names can be
961        specified manually:
962
963        >>> pd.DataFrame.from_dict(data, orient='index',
964        ...                        columns=['A', 'B', 'C', 'D'])
965               A  B  C  D
966        row_1  3  2  1  0
967        row_2  a  b  c  d
968        """
969        index = None
970        orient = orient.lower()
971        if orient == 'index':
972            if len(data) > 0:
973                # TODO speed up Series case
974                if isinstance(list(data.values())[0], (Series, dict)):
975                    data = _from_nested_dict(data)
976                else:
977                    data, index = list(data.values()), list(data.keys())
978        elif orient == 'columns':
979            if columns is not None:
980                raise ValueError("cannot use columns parameter with "
981                                 "orient='columns'")
982        else:  # pragma: no cover
983            raise ValueError('only recognize index or columns for orient')
984
985        return cls(data, index=index, columns=columns, dtype=dtype)
986
987    def to_dict(self, orient='dict', into=dict):
988        """
989        Convert the DataFrame to a dictionary.
990
991        The type of the key-value pairs can be customized with the parameters
992        (see below).
993
994        Parameters
995        ----------
996        orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
997            Determines the type of the values of the dictionary.
998
999            - 'dict' (default) : dict like {column -> {index -> value}}
1000            - 'list' : dict like {column -> [values]}
1001            - 'series' : dict like {column -> Series(values)}
1002            - 'split' : dict like
1003              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
1004            - 'records' : list like
1005              [{column -> value}, ... , {column -> value}]
1006            - 'index' : dict like {index -> {column -> value}}
1007
1008            Abbreviations are allowed. `s` indicates `series` and `sp`
1009            indicates `split`.
1010
1011        into : class, default dict
1012            The collections.Mapping subclass used for all Mappings
1013            in the return value.  Can be the actual class or an empty
1014            instance of the mapping type you want.  If you want a
1015            collections.defaultdict, you must pass it initialized.
1016
1017            .. versionadded:: 0.21.0
1018
1019        Returns
1020        -------
1021        result : collections.Mapping like {column -> {index -> value}}
1022
1023        See Also
1024        --------
1025        DataFrame.from_dict: create a DataFrame from a dictionary
1026        DataFrame.to_json: convert a DataFrame to JSON format
1027
1028        Examples
1029        --------
1030        >>> df = pd.DataFrame({'col1': [1, 2],
1031        ...                    'col2': [0.5, 0.75]},
1032        ...                   index=['a', 'b'])
1033        >>> df
1034           col1  col2
1035        a     1   0.50
1036        b     2   0.75
1037        >>> df.to_dict()
1038        {'col1': {'a': 1, 'b': 2}, 'col2': {'a': 0.5, 'b': 0.75}}
1039
1040        You can specify the return orientation.
1041
1042        >>> df.to_dict('series')
1043        {'col1': a    1
1044                 b    2
1045                 Name: col1, dtype: int64,
1046         'col2': a    0.50
1047                 b    0.75
1048                 Name: col2, dtype: float64}
1049
1050        >>> df.to_dict('split')
1051        {'index': ['a', 'b'], 'columns': ['col1', 'col2'],
1052         'data': [[1.0, 0.5], [2.0, 0.75]]}
1053
1054        >>> df.to_dict('records')
1055        [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}]
1056
1057        >>> df.to_dict('index')
1058        {'a': {'col1': 1.0, 'col2': 0.5}, 'b': {'col1': 2.0, 'col2': 0.75}}
1059
1060        You can also specify the mapping type.
1061
1062        >>> from collections import OrderedDict, defaultdict
1063        >>> df.to_dict(into=OrderedDict)
1064        OrderedDict([('col1', OrderedDict([('a', 1), ('b', 2)])),
1065                     ('col2', OrderedDict([('a', 0.5), ('b', 0.75)]))])
1066
1067        If you want a `defaultdict`, you need to initialize it:
1068
1069        >>> dd = defaultdict(list)
1070        >>> df.to_dict('records', into=dd)
1071        [defaultdict(<class 'list'>, {'col1': 1.0, 'col2': 0.5}),
1072         defaultdict(<class 'list'>, {'col1': 2.0, 'col2': 0.75})]
1073        """
1074        if not self.columns.is_unique:
1075            warnings.warn("DataFrame columns are not unique, some "
1076                          "columns will be omitted.", UserWarning,
1077                          stacklevel=2)
1078        # GH16122
1079        into_c = com.standardize_mapping(into)
1080        if orient.lower().startswith('d'):
1081            return into_c(
1082                (k, v.to_dict(into)) for k, v in compat.iteritems(self))
1083        elif orient.lower().startswith('l'):
1084            return into_c((k, v.tolist()) for k, v in compat.iteritems(self))
1085        elif orient.lower().startswith('sp'):
1086            return into_c((('index', self.index.tolist()),
1087                           ('columns', self.columns.tolist()),
1088                           ('data', lib.map_infer(self.values.ravel(),
1089                                                  com._maybe_box_datetimelike)
1090                            .reshape(self.values.shape).tolist())))
1091        elif orient.lower().startswith('s'):
1092            return into_c((k, com._maybe_box_datetimelike(v))
1093                          for k, v in compat.iteritems(self))
1094        elif orient.lower().startswith('r'):
1095            return [into_c((k, com._maybe_box_datetimelike(v))
1096                           for k, v in zip(self.columns, np.atleast_1d(row)))
1097                    for row in self.values]
1098        elif orient.lower().startswith('i'):
1099            return into_c((t[0], dict(zip(self.columns, t[1:])))
1100                          for t in self.itertuples())
1101        else:
1102            raise ValueError("orient '{o}' not understood".format(o=orient))
1103
1104    def to_gbq(self, destination_table, project_id, chunksize=None,
1105               verbose=None, reauth=False, if_exists='fail', private_key=None,
1106               auth_local_webserver=False, table_schema=None):
1107        """
1108        Write a DataFrame to a Google BigQuery table.
1109
1110        This function requires the `pandas-gbq package
1111        <https://pandas-gbq.readthedocs.io>`__.
1112
1113        Authentication to the Google BigQuery service is via OAuth 2.0.
1114
1115        - If ``private_key`` is provided, the library loads the JSON service
1116          account credentials and uses those to authenticate.
1117
1118        - If no ``private_key`` is provided, the library tries `application
1119          default credentials`_.
1120
1121          .. _application default credentials:
1122              https://cloud.google.com/docs/authentication/production#providing_credentials_to_your_application
1123
1124        - If application default credentials are not found or cannot be used
1125          with BigQuery, the library authenticates with user account
1126          credentials. In this case, you will be asked to grant permissions
1127          for product name 'pandas GBQ'.
1128
1129        Parameters
1130        ----------
1131        destination_table : str
1132            Name of table to be written, in the form 'dataset.tablename'.
1133        project_id : str
1134            Google BigQuery Account project ID.
1135        chunksize : int, optional
1136            Number of rows to be inserted in each chunk from the dataframe.
1137            Set to ``None`` to load the whole dataframe at once.
1138        reauth : bool, default False
1139            Force Google BigQuery to reauthenticate the user. This is useful
1140            if multiple accounts are used.
1141        if_exists : str, default 'fail'
1142            Behavior when the destination table exists. Value can be one of:
1143
1144            ``'fail'``
1145                If table exists, do nothing.
1146            ``'replace'``
1147                If table exists, drop it, recreate it, and insert data.
1148            ``'append'``
1149                If table exists, insert data. Create if does not exist.
1150        private_key : str, optional
1151            Service account private key in JSON format. Can be file path
1152            or string contents. This is useful for remote server
1153            authentication (eg. Jupyter/IPython notebook on remote host).
1154        auth_local_webserver : bool, default False
1155            Use the `local webserver flow`_ instead of the `console flow`_
1156            when getting user credentials.
1157
1158            .. _local webserver flow:
1159                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
1160            .. _console flow:
1161                http://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
1162
1163            *New in version 0.2.0 of pandas-gbq*.
1164        table_schema : list of dicts, optional
1165            List of BigQuery table fields to which according DataFrame
1166            columns conform to, e.g. ``[{'name': 'col1', 'type':
1167            'STRING'},...]``. If schema is not provided, it will be
1168            generated according to dtypes of DataFrame columns. See
1169            BigQuery API documentation on available names of a field.
1170
1171            *New in version 0.3.1 of pandas-gbq*.
1172        verbose : boolean, deprecated
1173            *Deprecated in Pandas-GBQ 0.4.0.* Use the `logging module
1174            to adjust verbosity instead
1175            <https://pandas-gbq.readthedocs.io/en/latest/intro.html#logging>`__.
1176
1177        See Also
1178        --------
1179        pandas_gbq.to_gbq : This function in the pandas-gbq library.
1180        pandas.read_gbq : Read a DataFrame from Google BigQuery.
1181        """
1182        from pandas.io import gbq
1183        return gbq.to_gbq(
1184            self, destination_table, project_id, chunksize=chunksize,
1185            verbose=verbose, reauth=reauth, if_exists=if_exists,
1186            private_key=private_key, auth_local_webserver=auth_local_webserver,
1187            table_schema=table_schema)
1188
1189    @classmethod
1190    def from_records(cls, data, index=None, exclude=None, columns=None,
1191                     coerce_float=False, nrows=None):
1192        """
1193        Convert structured or record ndarray to DataFrame
1194
1195        Parameters
1196        ----------
1197        data : ndarray (structured dtype), list of tuples, dict, or DataFrame
1198        index : string, list of fields, array-like
1199            Field of array to use as the index, alternately a specific set of
1200            input labels to use
1201        exclude : sequence, default None
1202            Columns or fields to exclude
1203        columns : sequence, default None
1204            Column names to use. If the passed data do not have names
1205            associated with them, this argument provides names for the
1206            columns. Otherwise this argument indicates the order of the columns
1207            in the result (any names not found in the data will become all-NA
1208            columns)
1209        coerce_float : boolean, default False
1210            Attempt to convert values of non-string, non-numeric objects (like
1211            decimal.Decimal) to floating point, useful for SQL result sets
1212
1213        Returns
1214        -------
1215        df : DataFrame
1216        """
1217
1218        # Make a copy of the input columns so we can modify it
1219        if columns is not None:
1220            columns = _ensure_index(columns)
1221
1222        if is_iterator(data):
1223            if nrows == 0:
1224                return cls()
1225
1226            try:
1227                first_row = next(data)
1228            except StopIteration:
1229                return cls(index=index, columns=columns)
1230
1231            dtype = None
1232            if hasattr(first_row, 'dtype') and first_row.dtype.names:
1233                dtype = first_row.dtype
1234
1235            values = [first_row]
1236
1237            if nrows is None:
1238                values += data
1239            else:
1240                values.extend(itertools.islice(data, nrows - 1))
1241
1242            if dtype is not None:
1243                data = np.array(values, dtype=dtype)
1244            else:
1245                data = values
1246
1247        if isinstance(data, dict):
1248            if columns is None:
1249                columns = arr_columns = _ensure_index(sorted(data))
1250                arrays = [data[k] for k in columns]
1251            else:
1252                arrays = []
1253                arr_columns = []
1254                for k, v in compat.iteritems(data):
1255                    if k in columns:
1256                        arr_columns.append(k)
1257                        arrays.append(v)
1258
1259                arrays, arr_columns = _reorder_arrays(arrays, arr_columns,
1260                                                      columns)
1261
1262        elif isinstance(data, (np.ndarray, DataFrame)):
1263            arrays, columns = _to_arrays(data, columns)
1264            if columns is not None:
1265                columns = _ensure_index(columns)
1266            arr_columns = columns
1267        else:
1268            arrays, arr_columns = _to_arrays(data, columns,
1269                                             coerce_float=coerce_float)
1270
1271            arr_columns = _ensure_index(arr_columns)
1272            if columns is not None:
1273                columns = _ensure_index(columns)
1274            else:
1275                columns = arr_columns
1276
1277        if exclude is None:
1278            exclude = set()
1279        else:
1280            exclude = set(exclude)
1281
1282        result_index = None
1283        if index is not None:
1284            if (isinstance(index, compat.string_types) or
1285                    not hasattr(index, "__iter__")):
1286                i = columns.get_loc(index)
1287                exclude.add(index)
1288                if len(arrays) > 0:
1289                    result_index = Index(arrays[i], name=index)
1290                else:
1291                    result_index = Index([], name=index)
1292            else:
1293                try:
1294                    to_remove = [arr_columns.get_loc(field) for field in index]
1295                    index_data = [arrays[i] for i in to_remove]
1296                    result_index = _ensure_index_from_sequences(index_data,
1297                                                                names=index)
1298
1299                    exclude.update(index)
1300                except Exception:
1301                    result_index = index
1302
1303        if any(exclude):
1304            arr_exclude = [x for x in exclude if x in arr_columns]
1305            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
1306            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
1307
1308            arr_columns = arr_columns.drop(arr_exclude)
1309            columns = columns.drop(exclude)
1310
1311        mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns)
1312
1313        return cls(mgr)
1314
1315    def to_records(self, index=True, convert_datetime64=None):
1316        """
1317        Convert DataFrame to a NumPy record array.
1318
1319        Index will be put in the 'index' field of the record array if
1320        requested.
1321
1322        Parameters
1323        ----------
1324        index : boolean, default True
1325            Include index in resulting record array, stored in 'index' field.
1326        convert_datetime64 : boolean, default None
1327            .. deprecated:: 0.23.0
1328
1329            Whether to convert the index to datetime.datetime if it is a
1330            DatetimeIndex.
1331
1332        Returns
1333        -------
1334        y : numpy.recarray
1335
1336        See Also
1337        --------
1338        DataFrame.from_records: convert structured or record ndarray
1339            to DataFrame.
1340        numpy.recarray: ndarray that allows field access using
1341            attributes, analogous to typed columns in a
1342            spreadsheet.
1343
1344        Examples
1345        --------
1346        >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
1347        ...                   index=['a', 'b'])
1348        >>> df
1349           A     B
1350        a  1  0.50
1351        b  2  0.75
1352        >>> df.to_records()
1353        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
1354                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
1355
1356        The index can be excluded from the record array:
1357
1358        >>> df.to_records(index=False)
1359        rec.array([(1, 0.5 ), (2, 0.75)],
1360                  dtype=[('A', '<i8'), ('B', '<f8')])
1361
1362        By default, timestamps are converted to `datetime.datetime`:
1363
1364        >>> df.index = pd.date_range('2018-01-01 09:00', periods=2, freq='min')
1365        >>> df
1366                             A     B
1367        2018-01-01 09:00:00  1  0.50
1368        2018-01-01 09:01:00  2  0.75
1369        >>> df.to_records()
1370        rec.array([(datetime.datetime(2018, 1, 1, 9, 0), 1, 0.5 ),
1371                   (datetime.datetime(2018, 1, 1, 9, 1), 2, 0.75)],
1372                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
1373
1374        The timestamp conversion can be disabled so NumPy's datetime64
1375        data type is used instead:
1376
1377        >>> df.to_records(convert_datetime64=False)
1378        rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
1379                   ('2018-01-01T09:01:00.000000000', 2, 0.75)],
1380                  dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
1381        """
1382
1383        if convert_datetime64 is not None:
1384            warnings.warn("The 'convert_datetime64' parameter is "
1385                          "deprecated and will be removed in a future "
1386                          "version",
1387                          FutureWarning, stacklevel=2)
1388
1389        if index:
1390            if is_datetime64_any_dtype(self.index) and convert_datetime64:
1391                ix_vals = [self.index.to_pydatetime()]
1392            else:
1393                if isinstance(self.index, MultiIndex):
1394                    # array of tuples to numpy cols. copy copy copy
1395                    ix_vals = lmap(np.array, zip(*self.index.values))
1396                else:
1397                    ix_vals = [self.index.values]
1398
1399            arrays = ix_vals + [self[c].get_values() for c in self.columns]
1400
1401            count = 0
1402            index_names = list(self.index.names)
1403            if isinstance(self.index, MultiIndex):
1404                for i, n in enumerate(index_names):
1405                    if n is None:
1406                        index_names[i] = 'level_%d' % count
1407                        count += 1
1408            elif index_names[0] is None:
1409                index_names = ['index']
1410            names = (lmap(compat.text_type, index_names) +
1411                     lmap(compat.text_type, self.columns))
1412        else:
1413            arrays = [self[c].get_values() for c in self.columns]
1414            names = lmap(compat.text_type, self.columns)
1415
1416        formats = [v.dtype for v in arrays]
1417        return np.rec.fromarrays(
1418            arrays,
1419            dtype={'names': names, 'formats': formats}
1420        )
1421
1422    @classmethod
1423    def from_items(cls, items, columns=None, orient='columns'):
1424        """Construct a dataframe from a list of tuples
1425
1426        .. deprecated:: 0.23.0
1427          `from_items` is deprecated and will be removed in a future version.
1428          Use :meth:`DataFrame.from_dict(dict(items)) <DataFrame.from_dict>`
1429          instead.
1430          :meth:`DataFrame.from_dict(OrderedDict(items)) <DataFrame.from_dict>`
1431          may be used to preserve the key order.
1432
1433        Convert (key, value) pairs to DataFrame. The keys will be the axis
1434        index (usually the columns, but depends on the specified
1435        orientation). The values should be arrays or Series.
1436
1437        Parameters
1438        ----------
1439        items : sequence of (key, value) pairs
1440            Values should be arrays or Series.
1441        columns : sequence of column labels, optional
1442            Must be passed if orient='index'.
1443        orient : {'columns', 'index'}, default 'columns'
1444            The "orientation" of the data. If the keys of the
1445            input correspond to column labels, pass 'columns'
1446            (default). Otherwise if the keys correspond to the index,
1447            pass 'index'.
1448
1449        Returns
1450        -------
1451        frame : DataFrame
1452        """
1453
1454        warnings.warn("from_items is deprecated. Please use "
1455                      "DataFrame.from_dict(dict(items), ...) instead. "
1456                      "DataFrame.from_dict(OrderedDict(items)) may be used to "
1457                      "preserve the key order.",
1458                      FutureWarning, stacklevel=2)
1459
1460        keys, values = lzip(*items)
1461
1462        if orient == 'columns':
1463            if columns is not None:
1464                columns = _ensure_index(columns)
1465
1466                idict = dict(items)
1467                if len(idict) < len(items):
1468                    if not columns.equals(_ensure_index(keys)):
1469                        raise ValueError('With non-unique item names, passed '
1470                                         'columns must be identical')
1471                    arrays = values
1472                else:
1473                    arrays = [idict[k] for k in columns if k in idict]
1474            else:
1475                columns = _ensure_index(keys)
1476                arrays = values
1477
1478            # GH 17312
1479            # Provide more informative error msg when scalar values passed
1480            try:
1481                return cls._from_arrays(arrays, columns, None)
1482
1483            except ValueError:
1484                if not is_nested_list_like(values):
1485                    raise ValueError('The value in each (key, value) pair '
1486                                     'must be an array, Series, or dict')
1487
1488        elif orient == 'index':
1489            if columns is None:
1490                raise TypeError("Must pass columns with orient='index'")
1491
1492            keys = _ensure_index(keys)
1493
1494            # GH 17312
1495            # Provide more informative error msg when scalar values passed
1496            try:
1497                arr = np.array(values, dtype=object).T
1498                data = [lib.maybe_convert_objects(v) for v in arr]
1499                return cls._from_arrays(data, columns, keys)
1500
1501            except TypeError:
1502                if not is_nested_list_like(values):
1503                    raise ValueError('The value in each (key, value) pair '
1504                                     'must be an array, Series, or dict')
1505
1506        else:  # pragma: no cover
1507            raise ValueError("'orient' must be either 'columns' or 'index'")
1508
1509    @classmethod
1510    def _from_arrays(cls, arrays, columns, index, dtype=None):
1511        mgr = _arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
1512        return cls(mgr)
1513
1514    @classmethod
1515    def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True,
1516                 encoding=None, tupleize_cols=None,
1517                 infer_datetime_format=False):
1518        """Read CSV file.
1519
1520        .. deprecated:: 0.21.0
1521            Use :func:`pandas.read_csv` instead.
1522
1523        It is preferable to use the more powerful :func:`pandas.read_csv`
1524        for most general purposes, but ``from_csv`` makes for an easy
1525        roundtrip to and from a file (the exact counterpart of
1526        ``to_csv``), especially with a DataFrame of time series data.
1527
1528        This method only differs from the preferred :func:`pandas.read_csv`
1529        in some defaults:
1530
1531        - `index_col` is ``0`` instead of ``None`` (take first column as index
1532          by default)
1533        - `parse_dates` is ``True`` instead of ``False`` (try parsing the index
1534          as datetime by default)
1535
1536        So a ``pd.DataFrame.from_csv(path)`` can be replaced by
1537        ``pd.read_csv(path, index_col=0, parse_dates=True)``.
1538
1539        Parameters
1540        ----------
1541        path : string file path or file handle / StringIO
1542        header : int, default 0
1543            Row to use as header (skip prior rows)
1544        sep : string, default ','
1545            Field delimiter
1546        index_col : int or sequence, default 0
1547            Column to use for index. If a sequence is given, a MultiIndex
1548            is used. Different default from read_table
1549        parse_dates : boolean, default True
1550            Parse dates. Different default from read_table
1551        tupleize_cols : boolean, default False
1552            write multi_index columns as a list of tuples (if True)
1553            or new (expanded format) if False)
1554        infer_datetime_format: boolean, default False
1555            If True and `parse_dates` is True for a column, try to infer the
1556            datetime format based on the first datetime string. If the format
1557            can be inferred, there often will be a large parsing speed-up.
1558
1559        See also
1560        --------
1561        pandas.read_csv
1562
1563        Returns
1564        -------
1565        y : DataFrame
1566
1567        """
1568
1569        warnings.warn("from_csv is deprecated. Please use read_csv(...) "
1570                      "instead. Note that some of the default arguments are "
1571                      "different, so please refer to the documentation "
1572                      "for from_csv when changing your function calls",
1573                      FutureWarning, stacklevel=2)
1574
1575        from pandas.io.parsers import read_table
1576        return read_table(path, header=header, sep=sep,
1577                          parse_dates=parse_dates, index_col=index_col,
1578                          encoding=encoding, tupleize_cols=tupleize_cols,
1579                          infer_datetime_format=infer_datetime_format)
1580
1581    def to_sparse(self, fill_value=None, kind='block'):
1582        """
1583        Convert to SparseDataFrame
1584
1585        Parameters
1586        ----------
1587        fill_value : float, default NaN
1588        kind : {'block', 'integer'}
1589
1590        Returns
1591        -------
1592        y : SparseDataFrame
1593        """
1594        from pandas.core.sparse.frame import SparseDataFrame
1595        return SparseDataFrame(self._series, index=self.index,
1596                               columns=self.columns, default_kind=kind,
1597                               default_fill_value=fill_value)
1598
1599    def to_panel(self):
1600        """
1601        Transform long (stacked) format (DataFrame) into wide (3D, Panel)
1602        format.
1603
1604        .. deprecated:: 0.20.0
1605
1606        Currently the index of the DataFrame must be a 2-level MultiIndex. This
1607        may be generalized later
1608
1609        Returns
1610        -------
1611        panel : Panel
1612        """
1613        # only support this kind for now
1614        if (not isinstance(self.index, MultiIndex) or  # pragma: no cover
1615                len(self.index.levels) != 2):
1616            raise NotImplementedError('Only 2-level MultiIndex are supported.')
1617
1618        if not self.index.is_unique:
1619            raise ValueError("Can't convert non-uniquely indexed "
1620                             "DataFrame to Panel")
1621
1622        self._consolidate_inplace()
1623
1624        # minor axis must be sorted
1625        if self.index.lexsort_depth < 2:
1626            selfsorted = self.sort_index(level=0)
1627        else:
1628            selfsorted = self
1629
1630        major_axis, minor_axis = selfsorted.index.levels
1631        major_labels, minor_labels = selfsorted.index.labels
1632        shape = len(major_axis), len(minor_axis)
1633
1634        # preserve names, if any
1635        major_axis = major_axis.copy()
1636        major_axis.name = self.index.names[0]
1637
1638        minor_axis = minor_axis.copy()
1639        minor_axis.name = self.index.names[1]
1640
1641        # create new axes
1642        new_axes = [selfsorted.columns, major_axis, minor_axis]
1643
1644        # create new manager
1645        new_mgr = selfsorted._data.reshape_nd(axes=new_axes,
1646                                              labels=[major_labels,
1647                                                      minor_labels],
1648                                              shape=shape,
1649                                              ref_items=selfsorted.columns)
1650
1651        return self._constructor_expanddim(new_mgr)
1652
1653    def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
1654               columns=None, header=True, index=True, index_label=None,
1655               mode='w', encoding=None, compression=None, quoting=None,
1656               quotechar='"', line_terminator='\n', chunksize=None,
1657               tupleize_cols=None, date_format=None, doublequote=True,
1658               escapechar=None, decimal='.'):
1659        r"""Write DataFrame to a comma-separated values (csv) file
1660
1661        Parameters
1662        ----------
1663        path_or_buf : string or file handle, default None
1664            File path or object, if None is provided the result is returned as
1665            a string.
1666        sep : character, default ','
1667            Field delimiter for the output file.
1668        na_rep : string, default ''
1669            Missing data representation
1670        float_format : string, default None
1671            Format string for floating point numbers
1672        columns : sequence, optional
1673            Columns to write
1674        header : boolean or list of string, default True
1675            Write out the column names. If a list of strings is given it is
1676            assumed to be aliases for the column names
1677        index : boolean, default True
1678            Write row names (index)
1679        index_label : string or sequence, or False, default None
1680            Column label for index column(s) if desired. If None is given, and
1681            `header` and `index` are True, then the index names are used. A
1682            sequence should be given if the DataFrame uses MultiIndex.  If
1683            False do not print fields for index names. Use index_label=False
1684            for easier importing in R
1685        mode : str
1686            Python write mode, default 'w'
1687        encoding : string, optional
1688            A string representing the encoding to use in the output file,
1689            defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
1690        compression : string, optional
1691            A string representing the compression to use in the output file.
1692            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
1693            used when the first argument is a filename.
1694        line_terminator : string, default ``'\n'``
1695            The newline character or character sequence to use in the output
1696            file
1697        quoting : optional constant from csv module
1698            defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
1699            then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
1700            will treat them as non-numeric
1701        quotechar : string (length 1), default '\"'
1702            character used to quote fields
1703        doublequote : boolean, default True
1704            Control quoting of `quotechar` inside a field
1705        escapechar : string (length 1), default None
1706            character used to escape `sep` and `quotechar` when appropriate
1707        chunksize : int or None
1708            rows to write at a time
1709        tupleize_cols : boolean, default False
1710            .. deprecated:: 0.21.0
1711               This argument will be removed and will always write each row
1712               of the multi-index as a separate row in the CSV file.
1713
1714            Write MultiIndex columns as a list of tuples (if True) or in
1715            the new, expanded format, where each MultiIndex column is a row
1716            in the CSV (if False).
1717        date_format : string, default None
1718            Format string for datetime objects
1719        decimal: string, default '.'
1720            Character recognized as decimal separator. E.g. use ',' for
1721            European data
1722
1723        """
1724
1725        if tupleize_cols is not None:
1726            warnings.warn("The 'tupleize_cols' parameter is deprecated and "
1727                          "will be removed in a future version",
1728                          FutureWarning, stacklevel=2)
1729        else:
1730            tupleize_cols = False
1731
1732        from pandas.io.formats.csvs import CSVFormatter
1733        formatter = CSVFormatter(self, path_or_buf,
1734                                 line_terminator=line_terminator, sep=sep,
1735                                 encoding=encoding,
1736                                 compression=compression, quoting=quoting,
1737                                 na_rep=na_rep, float_format=float_format,
1738                                 cols=columns, header=header, index=index,
1739                                 index_label=index_label, mode=mode,
1740                                 chunksize=chunksize, quotechar=quotechar,
1741                                 tupleize_cols=tupleize_cols,
1742                                 date_format=date_format,
1743                                 doublequote=doublequote,
1744                                 escapechar=escapechar, decimal=decimal)
1745        formatter.save()
1746
1747        if path_or_buf is None:
1748            return formatter.path_or_buf.getvalue()
1749
1750    @Appender(_shared_docs['to_excel'] % _shared_doc_kwargs)
1751    def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
1752                 float_format=None, columns=None, header=True, index=True,
1753                 index_label=None, startrow=0, startcol=0, engine=None,
1754                 merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
1755                 freeze_panes=None):
1756
1757        from pandas.io.formats.excel import ExcelFormatter
1758        formatter = ExcelFormatter(self, na_rep=na_rep, cols=columns,
1759                                   header=header,
1760                                   float_format=float_format, index=index,
1761                                   index_label=index_label,
1762                                   merge_cells=merge_cells,
1763                                   inf_rep=inf_rep)
1764        formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow,
1765                        startcol=startcol, freeze_panes=freeze_panes,
1766                        engine=engine)
1767
1768    def to_stata(self, fname, convert_dates=None, write_index=True,
1769                 encoding="latin-1", byteorder=None, time_stamp=None,
1770                 data_label=None, variable_labels=None, version=114,
1771                 convert_strl=None):
1772        """
1773        Export Stata binary dta files.
1774
1775        Parameters
1776        ----------
1777        fname : path (string), buffer or path object
1778            string, path object (pathlib.Path or py._path.local.LocalPath) or
1779            object implementing a binary write() functions. If using a buffer
1780            then the buffer will not be automatically closed after the file
1781            data has been written.
1782        convert_dates : dict
1783            Dictionary mapping columns containing datetime types to stata
1784            internal format to use when writing the dates. Options are 'tc',
1785            'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
1786            or a name. Datetime columns that do not have a conversion type
1787            specified will be converted to 'tc'. Raises NotImplementedError if
1788            a datetime column has timezone information.
1789        write_index : bool
1790            Write the index to Stata dataset.
1791        encoding : str
1792            Default is latin-1. Unicode is not supported.
1793        byteorder : str
1794            Can be ">", "<", "little", or "big". default is `sys.byteorder`.
1795        time_stamp : datetime
1796            A datetime to use as file creation date.  Default is the current
1797            time.
1798        data_label : str
1799            A label for the data set.  Must be 80 characters or smaller.
1800        variable_labels : dict
1801            Dictionary containing columns as keys and variable labels as
1802            values. Each label must be 80 characters or smaller.
1803
1804            .. versionadded:: 0.19.0
1805
1806        version : {114, 117}
1807            Version to use in the output dta file.  Version 114 can be used
1808            read by Stata 10 and later.  Version 117 can be read by Stata 13
1809            or later. Version 114 limits string variables to 244 characters or
1810            fewer while 117 allows strings with lengths up to 2,000,000
1811            characters.
1812
1813            .. versionadded:: 0.23.0
1814
1815        convert_strl : list, optional
1816            List of column names to convert to string columns to Stata StrL
1817            format. Only available if version is 117.  Storing strings in the
1818            StrL format can produce smaller dta files if strings have more than
1819            8 characters and values are repeated.
1820
1821            .. versionadded:: 0.23.0
1822
1823        Raises
1824        ------
1825        NotImplementedError
1826            * If datetimes contain timezone information
1827            * Column dtype is not representable in Stata
1828        ValueError
1829            * Columns listed in convert_dates are neither datetime64[ns]
1830              or datetime.datetime
1831            * Column listed in convert_dates is not in DataFrame
1832            * Categorical label contains more than 32,000 characters
1833
1834            .. versionadded:: 0.19.0
1835
1836        See Also
1837        --------
1838        pandas.read_stata : Import Stata data files
1839        pandas.io.stata.StataWriter : low-level writer for Stata data files
1840        pandas.io.stata.StataWriter117 : low-level writer for version 117 files
1841
1842        Examples
1843        --------
1844        >>> data.to_stata('./data_file.dta')
1845
1846        Or with dates
1847
1848        >>> data.to_stata('./date_data_file.dta', {2 : 'tw'})
1849
1850        Alternatively you can create an instance of the StataWriter class
1851
1852        >>> writer = StataWriter('./data_file.dta', data)
1853        >>> writer.write_file()
1854
1855        With dates:
1856
1857        >>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
1858        >>> writer.write_file()
1859        """
1860        kwargs = {}
1861        if version not in (114, 117):
1862            raise ValueError('Only formats 114 and 117 supported.')
1863        if version == 114:
1864            if convert_strl is not None:
1865                raise ValueError('strl support is only available when using '
1866                                 'format 117')
1867            from pandas.io.stata import StataWriter as statawriter
1868        else:
1869            from pandas.io.stata import StataWriter117 as statawriter
1870            kwargs['convert_strl'] = convert_strl
1871
1872        writer = statawriter(fname, self, convert_dates=convert_dates,
1873                             encoding=encoding, byteorder=byteorder,
1874                             time_stamp=time_stamp, data_label=data_label,
1875                             write_index=write_index,
1876                             variable_labels=variable_labels, **kwargs)
1877        writer.write_file()
1878
1879    def to_feather(self, fname):
1880        """
1881        write out the binary feather-format for DataFrames
1882
1883        .. versionadded:: 0.20.0
1884
1885        Parameters
1886        ----------
1887        fname : str
1888            string file path
1889
1890        """
1891        from pandas.io.feather_format import to_feather
1892        to_feather(self, fname)
1893
1894    def to_parquet(self, fname, engine='auto', compression='snappy',
1895                   **kwargs):
1896        """
1897        Write a DataFrame to the binary parquet format.
1898
1899        .. versionadded:: 0.21.0
1900
1901        This function writes the dataframe as a `parquet file
1902        <https://parquet.apache.org/>`_. You can choose different parquet
1903        backends, and have the option of compression. See
1904        :ref:`the user guide <io.parquet>` for more details.
1905
1906        Parameters
1907        ----------
1908        fname : str
1909            String file path.
1910        engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
1911            Parquet library to use. If 'auto', then the option
1912            ``io.parquet.engine`` is used. The default ``io.parquet.engine``
1913            behavior is to try 'pyarrow', falling back to 'fastparquet' if
1914            'pyarrow' is unavailable.
1915        compression : {'snappy', 'gzip', 'brotli', None}, default 'snappy'
1916            Name of the compression to use. Use ``None`` for no compression.
1917        **kwargs
1918            Additional arguments passed to the parquet library. See
1919            :ref:`pandas io <io.parquet>` for more details.
1920
1921        See Also
1922        --------
1923        read_parquet : Read a parquet file.
1924        DataFrame.to_csv : Write a csv file.
1925        DataFrame.to_sql : Write to a sql table.
1926        DataFrame.to_hdf : Write to hdf.
1927
1928        Notes
1929        -----
1930        This function requires either the `fastparquet
1931        <https://pypi.org/project/fastparquet>`_ or `pyarrow
1932        <https://arrow.apache.org/docs/python/>`_ library.
1933
1934        Examples
1935        --------
1936        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
1937        >>> df.to_parquet('df.parquet.gzip', compression='gzip')
1938        >>> pd.read_parquet('df.parquet.gzip')
1939           col1  col2
1940        0     1     3
1941        1     2     4
1942        """
1943        from pandas.io.parquet import to_parquet
1944        to_parquet(self, fname, engine,
1945                   compression=compression, **kwargs)
1946
1947    @Substitution(header='Write out the column names. If a list of strings '
1948                         'is given, it is assumed to be aliases for the '
1949                         'column names')
1950    @Appender(fmt.docstring_to_string, indents=1)
1951    def to_string(self, buf=None, columns=None, col_space=None, header=True,
1952                  index=True, na_rep='NaN', formatters=None, float_format=None,
1953                  sparsify=None, index_names=True, justify=None,
1954                  line_width=None, max_rows=None, max_cols=None,
1955                  show_dimensions=False):
1956        """
1957        Render a DataFrame to a console-friendly tabular output.
1958        """
1959
1960        formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
1961                                           col_space=col_space, na_rep=na_rep,
1962                                           formatters=formatters,
1963                                           float_format=float_format,
1964                                           sparsify=sparsify, justify=justify,
1965                                           index_names=index_names,
1966                                           header=header, index=index,
1967                                           line_width=line_width,
1968                                           max_rows=max_rows,
1969                                           max_cols=max_cols,
1970                                           show_dimensions=show_dimensions)
1971        formatter.to_string()
1972
1973        if buf is None:
1974            result = formatter.buf.getvalue()
1975            return result
1976
1977    @Substitution(header='whether to print column labels, default True')
1978    @Appender(fmt.docstring_to_string, indents=1)
1979    def to_html(self, buf=None, columns=None, col_space=None, header=True,
1980                index=True, na_rep='NaN', formatters=None, float_format=None,
1981                sparsify=None, index_names=True, justify=None, bold_rows=True,
1982                classes=None, escape=True, max_rows=None, max_cols=None,
1983                show_dimensions=False, notebook=False, decimal='.',
1984                border=None, table_id=None):
1985        """
1986        Render a DataFrame as an HTML table.
1987
1988        `to_html`-specific options:
1989
1990        bold_rows : boolean, default True
1991            Make the row labels bold in the output
1992        classes : str or list or tuple, default None
1993            CSS class(es) to apply to the resulting html table
1994        escape : boolean, default True
1995            Convert the characters <, >, and & to HTML-safe sequences.
1996        max_rows : int, optional
1997            Maximum number of rows to show before truncating. If None, show
1998            all.
1999        max_cols : int, optional
2000            Maximum number of columns to show before truncating. If None, show
2001            all.
2002        decimal : string, default '.'
2003            Character recognized as decimal separator, e.g. ',' in Europe
2004
2005            .. versionadded:: 0.18.0
2006
2007        border : int
2008            A ``border=border`` attribute is included in the opening
2009            `<table>` tag. Default ``pd.options.html.border``.
2010
2011            .. versionadded:: 0.19.0
2012
2013        table_id : str, optional
2014            A css id is included in the opening `<table>` tag if specified.
2015
2016            .. versionadded:: 0.23.0
2017
2018        """
2019
2020        if (justify is not None and
2021                justify not in fmt._VALID_JUSTIFY_PARAMETERS):
2022            raise ValueError("Invalid value for justify parameter")
2023
2024        formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
2025                                           col_space=col_space, na_rep=na_rep,
2026                                           formatters=formatters,
2027                                           float_format=float_format,
2028                                           sparsify=sparsify, justify=justify,
2029                                           index_names=index_names,
2030                                           header=header, index=index,
2031                                           bold_rows=bold_rows, escape=escape,
2032                                           max_rows=max_rows,
2033                                           max_cols=max_cols,
2034                                           show_dimensions=show_dimensions,
2035                                           decimal=decimal, table_id=table_id)
2036        # TODO: a generic formatter wld b in DataFrameFormatter
2037        formatter.to_html(classes=classes, notebook=notebook, border=border)
2038
2039        if buf is None:
2040            return formatter.buf.getvalue()
2041
2042    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None,
2043             null_counts=None):
2044        """
2045        Print a concise summary of a DataFrame.
2046
2047        This method prints information about a DataFrame including
2048        the index dtype and column dtypes, non-null values and memory usage.
2049
2050        Parameters
2051        ----------
2052        verbose : bool, optional
2053            Whether to print the full summary. By default, the setting in
2054            ``pandas.options.display.max_info_columns`` is followed.
2055        buf : writable buffer, defaults to sys.stdout
2056            Where to send the output. By default, the output is printed to
2057            sys.stdout. Pass a writable buffer if you need to further process
2058            the output.
2059        max_cols : int, optional
2060            When to switch from the verbose to the truncated output. If the
2061            DataFrame has more than `max_cols` columns, the truncated output
2062            is used. By default, the setting in
2063            ``pandas.options.display.max_info_columns`` is used.
2064        memory_usage : bool, str, optional
2065            Specifies whether total memory usage of the DataFrame
2066            elements (including the index) should be displayed. By default,
2067            this follows the ``pandas.options.display.memory_usage`` setting.
2068
2069            True always show memory usage. False never shows memory usage.
2070            A value of 'deep' is equivalent to "True with deep introspection".
2071            Memory usage is shown in human-readable units (base-2
2072            representation). Without deep introspection a memory estimation is
2073            made based in column dtype and number of rows assuming values
2074            consume the same memory amount for corresponding dtypes. With deep
2075            memory introspection, a real memory usage calculation is performed
2076            at the cost of computational resources.
2077        null_counts : bool, optional
2078            Whether to show the non-null counts. By default, this is shown
2079            only if the frame is smaller than
2080            ``pandas.options.display.max_info_rows`` and
2081            ``pandas.options.display.max_info_columns``. A value of True always
2082            shows the counts, and False never shows the counts.
2083
2084        Returns
2085        -------
2086        None
2087            This method prints a summary of a DataFrame and returns None.
2088
2089        See Also
2090        --------
2091        DataFrame.describe: Generate descriptive statistics of DataFrame
2092            columns.
2093        DataFrame.memory_usage: Memory usage of DataFrame columns.
2094
2095        Examples
2096        --------
2097        >>> int_values = [1, 2, 3, 4, 5]
2098        >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
2099        >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
2100        >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
2101        ...                   "float_col": float_values})
2102        >>> df
2103           int_col text_col  float_col
2104        0        1    alpha       0.00
2105        1        2     beta       0.25
2106        2        3    gamma       0.50
2107        3        4    delta       0.75
2108        4        5  epsilon       1.00
2109
2110        Prints information of all columns:
2111
2112        >>> df.info(verbose=True)
2113        <class 'pandas.core.frame.DataFrame'>
2114        RangeIndex: 5 entries, 0 to 4
2115        Data columns (total 3 columns):
2116        int_col      5 non-null int64
2117        text_col     5 non-null object
2118        float_col    5 non-null float64
2119        dtypes: float64(1), int64(1), object(1)
2120        memory usage: 200.0+ bytes
2121
2122        Prints a summary of columns count and its dtypes but not per column
2123        information:
2124
2125        >>> df.info(verbose=False)
2126        <class 'pandas.core.frame.DataFrame'>
2127        RangeIndex: 5 entries, 0 to 4
2128        Columns: 3 entries, int_col to float_col
2129        dtypes: float64(1), int64(1), object(1)
2130        memory usage: 200.0+ bytes
2131
2132        Pipe output of DataFrame.info to buffer instead of sys.stdout, get
2133        buffer content and writes to a text file:
2134
2135        >>> import io
2136        >>> buffer = io.StringIO()
2137        >>> df.info(buf=buffer)
2138        >>> s = buffer.getvalue()
2139        >>> with open("df_info.txt", "w", encoding="utf-8") as f:
2140        ...     f.write(s)
2141        260
2142
2143        The `memory_usage` parameter allows deep introspection mode, specially
2144        useful for big DataFrames and fine-tune memory optimization:
2145
2146        >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6)
2147        >>> df = pd.DataFrame({
2148        ...     'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6),
2149        ...     'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6),
2150        ...     'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6)
2151        ... })
2152        >>> df.info()
2153        <class 'pandas.core.frame.DataFrame'>
2154        RangeIndex: 1000000 entries, 0 to 999999
2155        Data columns (total 3 columns):
2156        column_1    1000000 non-null object
2157        column_2    1000000 non-null object
2158        column_3    1000000 non-null object
2159        dtypes: object(3)
2160        memory usage: 22.9+ MB
2161
2162        >>> df.info(memory_usage='deep')
2163        <class 'pandas.core.frame.DataFrame'>
2164        RangeIndex: 1000000 entries, 0 to 999999
2165        Data columns (total 3 columns):
2166        column_1    1000000 non-null object
2167        column_2    1000000 non-null object
2168        column_3    1000000 non-null object
2169        dtypes: object(3)
2170        memory usage: 188.8 MB
2171        """
2172
2173        if buf is None:  # pragma: no cover
2174            buf = sys.stdout
2175
2176        lines = []
2177
2178        lines.append(str(type(self)))
2179        lines.append(self.index._summary())
2180
2181        if len(self.columns) == 0:
2182            lines.append('Empty {name}'.format(name=type(self).__name__))
2183            fmt.buffer_put_lines(buf, lines)
2184            return
2185
2186        cols = self.columns
2187
2188        # hack
2189        if max_cols is None:
2190            max_cols = get_option('display.max_info_columns',
2191                                  len(self.columns) + 1)
2192
2193        max_rows = get_option('display.max_info_rows', len(self) + 1)
2194
2195        if null_counts is None:
2196            show_counts = ((len(self.columns) <= max_cols) and
2197                           (len(self) < max_rows))
2198        else:
2199            show_counts = null_counts
2200        exceeds_info_cols = len(self.columns) > max_cols
2201
2202        def _verbose_repr():
2203            lines.append('Data columns (total %d columns):' %
2204                         len(self.columns))
2205            space = max(len(pprint_thing(k)) for k in self.columns) + 4
2206            counts = None
2207
2208            tmpl = "{count}{dtype}"
2209            if show_counts:
2210                counts = self.count()
2211                if len(cols) != len(counts):  # pragma: no cover
2212                    raise AssertionError(
2213                        'Columns must equal counts '
2214                        '({cols:d} != {counts:d})'.format(
2215                            cols=len(cols), counts=len(counts)))
2216                tmpl = "{count} non-null {dtype}"
2217
2218            dtypes = self.dtypes
2219            for i, col in enumerate(self.columns):
2220                dtype = dtypes.iloc[i]
2221                col = pprint_thing(col)
2222
2223                count = ""
2224                if show_counts:
2225                    count = counts.iloc[i]
2226
2227                lines.append(_put_str(col, space) + tmpl.format(count=count,
2228                                                                dtype=dtype))
2229
2230        def _non_verbose_repr():
2231            lines.append(self.columns._summary(name='Columns'))
2232
2233        def _sizeof_fmt(num, size_qualifier):
2234            # returns size in human readable format
2235            for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
2236                if num < 1024.0:
2237                    return ("{num:3.1f}{size_q} "
2238                            "{x}".format(num=num, size_q=size_qualifier, x=x))
2239                num /= 1024.0
2240            return "{num:3.1f}{size_q} {pb}".format(num=num,
2241                                                    size_q=size_qualifier,
2242                                                    pb='PB')
2243
2244        if verbose:
2245            _verbose_repr()
2246        elif verbose is False:  # specifically set to False, not nesc None
2247            _non_verbose_repr()
2248        else:
2249            if exceeds_info_cols:
2250                _non_verbose_repr()
2251            else:
2252                _verbose_repr()
2253
2254        counts = self.get_dtype_counts()
2255        dtypes = ['{k}({kk:d})'.format(k=k[0], kk=k[1]) for k
2256                  in sorted(compat.iteritems(counts))]
2257        lines.append('dtypes: {types}'.format(types=', '.join(dtypes)))
2258
2259        if memory_usage is None:
2260            memory_usage = get_option('display.memory_usage')
2261        if memory_usage:
2262            # append memory usage of df to display
2263            size_qualifier = ''
2264            if memory_usage == 'deep':
2265                deep = True
2266            else:
2267                # size_qualifier is just a best effort; not guaranteed to catch
2268                # all cases (e.g., it misses categorical data even with object
2269                # categories)
2270                deep = False
2271                if ('object' in counts or
2272                        self.index._is_memory_usage_qualified()):
2273                    size_qualifier = '+'
2274            mem_usage = self.memory_usage(index=True, deep=deep).sum()
2275            lines.append("memory usage: {mem}\n".format(
2276                mem=_sizeof_fmt(mem_usage, size_qualifier)))
2277
2278        fmt.buffer_put_lines(buf, lines)
2279
2280    def memory_usage(self, index=True, deep=False):
2281        """
2282        Return the memory usage of each column in bytes.
2283
2284        The memory usage can optionally include the contribution of
2285        the index and elements of `object` dtype.
2286
2287        This value is displayed in `DataFrame.info` by default. This can be
2288        suppressed by setting ``pandas.options.display.memory_usage`` to False.
2289
2290        Parameters
2291        ----------
2292        index : bool, default True
2293            Specifies whether to include the memory usage of the DataFrame's
2294            index in returned Series. If ``index=True`` the memory usage of the
2295            index the first item in the output.
2296        deep : bool, default False
2297            If True, introspect the data deeply by interrogating
2298            `object` dtypes for system-level memory consumption, and include
2299            it in the returned values.
2300
2301        Returns
2302        -------
2303        sizes : Series
2304            A Series whose index is the original column names and whose values
2305            is the memory usage of each column in bytes.
2306
2307        See Also
2308        --------
2309        numpy.ndarray.nbytes : Total bytes consumed by the elements of an
2310            ndarray.
2311        Series.memory_usage : Bytes consumed by a Series.
2312        pandas.Categorical : Memory-efficient array for string values with
2313            many repeated values.
2314        DataFrame.info : Concise summary of a DataFrame.
2315
2316        Examples
2317        --------
2318        >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
2319        >>> data = dict([(t, np.ones(shape=5000).astype(t))
2320        ...              for t in dtypes])
2321        >>> df = pd.DataFrame(data)
2322        >>> df.head()
2323           int64  float64  complex128 object  bool
2324        0      1      1.0      (1+0j)      1  True
2325        1      1      1.0      (1+0j)      1  True
2326        2      1      1.0      (1+0j)      1  True
2327        3      1      1.0      (1+0j)      1  True
2328        4      1      1.0      (1+0j)      1  True
2329
2330        >>> df.memory_usage()
2331        Index            80
2332        int64         40000
2333        float64       40000
2334        complex128    80000
2335        object        40000
2336        bool           5000
2337        dtype: int64
2338
2339        >>> df.memory_usage(index=False)
2340        int64         40000
2341        float64       40000
2342        complex128    80000
2343        object        40000
2344        bool           5000
2345        dtype: int64
2346
2347        The memory footprint of `object` dtype columns is ignored by default:
2348
2349        >>> df.memory_usage(deep=True)
2350        Index             80
2351        int64          40000
2352        float64        40000
2353        complex128     80000
2354        object        160000
2355        bool            5000
2356        dtype: int64
2357
2358        Use a Categorical for efficient storage of an object-dtype column with
2359        many repeated values.
2360
2361        >>> df['object'].astype('category').memory_usage(deep=True)
2362        5168
2363        """
2364        result = Series([c.memory_usage(index=False, deep=deep)
2365                         for col, c in self.iteritems()], index=self.columns)
2366        if index:
2367            result = Series(self.index.memory_usage(deep=deep),
2368                            index=['Index']).append(result)
2369        return result
2370
2371    def transpose(self, *args, **kwargs):
2372        """
2373        Transpose index and columns.
2374
2375        Reflect the DataFrame over its main diagonal by writing rows as columns
2376        and vice-versa. The property :attr:`.T` is an accessor to the method
2377        :meth:`transpose`.
2378
2379        Parameters
2380        ----------
2381        copy : bool, default False
2382            If True, the underlying data is copied. Otherwise (default), no
2383            copy is made if possible.
2384        *args, **kwargs
2385            Additional keywords have no effect but might be accepted for
2386            compatibility with numpy.
2387
2388        Returns
2389        -------
2390        DataFrame
2391            The transposed DataFrame.
2392
2393        See Also
2394        --------
2395        numpy.transpose : Permute the dimensions of a given array.
2396
2397        Notes
2398        -----
2399        Transposing a DataFrame with mixed dtypes will result in a homogeneous
2400        DataFrame with the `object` dtype. In such a case, a copy of the data
2401        is always made.
2402
2403        Examples
2404        --------
2405        **Square DataFrame with homogeneous dtype**
2406
2407        >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
2408        >>> df1 = pd.DataFrame(data=d1)
2409        >>> df1
2410           col1  col2
2411        0     1     3
2412        1     2     4
2413
2414        >>> df1_transposed = df1.T # or df1.transpose()
2415        >>> df1_transposed
2416              0  1
2417        col1  1  2
2418        col2  3  4
2419
2420        When the dtype is homogeneous in the original DataFrame, we get a
2421        transposed DataFrame with the same dtype:
2422
2423        >>> df1.dtypes
2424        col1    int64
2425        col2    int64
2426        dtype: object
2427        >>> df1_transposed.dtypes
2428        0    int64
2429        1    int64
2430        dtype: object
2431
2432        **Non-square DataFrame with mixed dtypes**
2433
2434        >>> d2 = {'name': ['Alice', 'Bob'],
2435        ...       'score': [9.5, 8],
2436        ...       'employed': [False, True],
2437        ...       'kids': [0, 0]}
2438        >>> df2 = pd.DataFrame(data=d2)
2439        >>> df2
2440            name  score  employed  kids
2441        0  Alice    9.5     False     0
2442        1    Bob    8.0      True     0
2443
2444        >>> df2_transposed = df2.T # or df2.transpose()
2445        >>> df2_transposed
2446                      0     1
2447        name      Alice   Bob
2448        score       9.5     8
2449        employed  False  True
2450        kids          0     0
2451
2452        When the DataFrame has mixed dtypes, we get a transposed DataFrame with
2453        the `object` dtype:
2454
2455        >>> df2.dtypes
2456        name         object
2457        score       float64
2458        employed       bool
2459        kids          int64
2460        dtype: object
2461        >>> df2_transposed.dtypes
2462        0    object
2463        1    object
2464        dtype: object
2465        """
2466        nv.validate_transpose(args, dict())
2467        return super(DataFrame, self).transpose(1, 0, **kwargs)
2468
2469    T = property(transpose)
2470
2471    # ----------------------------------------------------------------------
2472    # Picklability
2473
2474    # legacy pickle formats
2475    def _unpickle_frame_compat(self, state):  # pragma: no cover
2476        if len(state) == 2:  # pragma: no cover
2477            series, idx = state
2478            columns = sorted(series)
2479        else:
2480            series, cols, idx = state
2481            columns = com._unpickle_array(cols)
2482
2483        index = com._unpickle_array(idx)
2484        self._data = self._init_dict(series, index, columns, None)
2485
2486    def _unpickle_matrix_compat(self, state):  # pragma: no cover
2487        # old unpickling
2488        (vals, idx, cols), object_state = state
2489
2490        index = com._unpickle_array(idx)
2491        dm = DataFrame(vals, index=index, columns=com._unpickle_array(cols),
2492                       copy=False)
2493
2494        if object_state is not None:
2495            ovals, _, ocols = object_state
2496            objects = DataFrame(ovals, index=index,
2497                                columns=com._unpickle_array(ocols), copy=False)
2498
2499            dm = dm.join(objects)
2500
2501        self._data = dm._data
2502
2503    # ----------------------------------------------------------------------
2504    # Getting and setting elements
2505
2506    def get_value(self, index, col, takeable=False):
2507        """Quickly retrieve single value at passed column and index
2508
2509        .. deprecated:: 0.21.0
2510            Use .at[] or .iat[] accessors instead.
2511
2512        Parameters
2513        ----------
2514        index : row label
2515        col : column label
2516        takeable : interpret the index/col as indexers, default False
2517
2518        Returns
2519        -------
2520        value : scalar value
2521        """
2522
2523        warnings.warn("get_value is deprecated and will be removed "
2524                      "in a future release. Please use "
2525                      ".at[] or .iat[] accessors instead", FutureWarning,
2526                      stacklevel=2)
2527        return self._get_value(index, col, takeable=takeable)
2528
2529    def _get_value(self, index, col, takeable=False):
2530
2531        if takeable:
2532            series = self._iget_item_cache(col)
2533            return com._maybe_box_datetimelike(series._values[index])
2534
2535        series = self._get_item_cache(col)
2536        engine = self.index._engine
2537
2538        try:
2539            return engine.get_value(series._values, index)
2540        except (TypeError, ValueError):
2541
2542            # we cannot handle direct indexing
2543            # use positional
2544            col = self.columns.get_loc(col)
2545            index = self.index.get_loc(index)
2546            return self._get_value(index, col, takeable=True)
2547    _get_value.__doc__ = get_value.__doc__
2548
2549    def set_value(self, index, col, value, takeable=False):
2550        """Put single value at passed column and index
2551
2552        .. deprecated:: 0.21.0
2553            Use .at[] or .iat[] accessors instead.
2554
2555        Parameters
2556        ----------
2557        index : row label
2558        col : column label
2559        value : scalar value
2560        takeable : interpret the index/col as indexers, default False
2561
2562        Returns
2563        -------
2564        frame : DataFrame
2565            If label pair is contained, will be reference to calling DataFrame,
2566            otherwise a new object
2567        """
2568        warnings.warn("set_value is deprecated and will be removed "
2569                      "in a future release. Please use "
2570                      ".at[] or .iat[] accessors instead", FutureWarning,
2571                      stacklevel=2)
2572        return self._set_value(index, col, value, takeable=takeable)
2573
2574    def _set_value(self, index, col, value, takeable=False):
2575        try:
2576            if takeable is True:
2577                series = self._iget_item_cache(col)
2578                return series._set_value(index, value, takeable=True)
2579
2580            series = self._get_item_cache(col)
2581            engine = self.index._engine
2582            engine.set_value(series._values, index, value)
2583            return self
2584        except (KeyError, TypeError):
2585
2586            # set using a non-recursive method & reset the cache
2587            self.loc[index, col] = value
2588            self._item_cache.pop(col, None)
2589
2590            return self
2591    _set_value.__doc__ = set_value.__doc__
2592
2593    def _ixs(self, i, axis=0):
2594        """
2595        i : int, slice, or sequence of integers
2596        axis : int
2597        """
2598
2599        # irow
2600        if axis == 0:
2601            """
2602            Notes
2603            -----
2604            If slice passed, the resulting data will be a view
2605            """
2606
2607            if isinstance(i, slice):
2608                return self[i]
2609            else:
2610                label = self.index[i]
2611                if isinstance(label, Index):
2612                    # a location index by definition
2613                    result = self.take(i, axis=axis)
2614                    copy = True
2615                else:
2616                    new_values = self._data.fast_xs(i)
2617                    if is_scalar(new_values):
2618                        return new_values
2619
2620                    # if we are a copy, mark as such
2621                    copy = (isinstance(new_values, np.ndarray) and
2622                            new_values.base is None)
2623                    result = self._constructor_sliced(new_values,
2624                                                      index=self.columns,
2625                                                      name=self.index[i],
2626                                                      dtype=new_values.dtype)
2627                result._set_is_copy(self, copy=copy)
2628                return result
2629
2630        # icol
2631        else:
2632            """
2633            Notes
2634            -----
2635            If slice passed, the resulting data will be a view
2636            """
2637
2638            label = self.columns[i]
2639            if isinstance(i, slice):
2640                # need to return view
2641                lab_slice = slice(label[0], label[-1])
2642                return self.loc[:, lab_slice]
2643            else:
2644                if isinstance(label, Index):
2645                    return self._take(i, axis=1)
2646
2647                index_len = len(self.index)
2648
2649                # if the values returned are not the same length
2650                # as the index (iow a not found value), iget returns
2651                # a 0-len ndarray. This is effectively catching
2652                # a numpy error (as numpy should really raise)
2653                values = self._data.iget(i)
2654
2655                if index_len and not len(values):
2656                    values = np.array([np.nan] * index_len, dtype=object)
2657                result = self._box_col_values(values, label)
2658
2659                # this is a cached value, mark it so
2660                result._set_as_cached(label, self)
2661
2662                return result
2663
2664    def __getitem__(self, key):
2665        key = com._apply_if_callable(key, self)
2666
2667        # shortcut if we are an actual column
2668        is_mi_columns = isinstance(self.columns, MultiIndex)
2669        try:
2670            if key in self.columns and not is_mi_columns:
2671                return self._getitem_column(key)
2672        except:
2673            pass
2674
2675        # see if we can slice the rows
2676        indexer = convert_to_index_sliceable(self, key)
2677        if indexer is not None:
2678            return self._getitem_slice(indexer)
2679
2680        if isinstance(key, (Series, np.ndarray, Index, list)):
2681            # either boolean or fancy integer index
2682            return self._getitem_array(key)
2683        elif isinstance(key, DataFrame):
2684            return self._getitem_frame(key)
2685        elif is_mi_columns:
2686            return self._getitem_multilevel(key)
2687        else:
2688            return self._getitem_column(key)
2689
2690    def _getitem_column(self, key):
2691        """ return the actual column """
2692
2693        # get column
2694        if self.columns.is_unique:
2695            return self._get_item_cache(key)
2696
2697        # duplicate columns & possible reduce dimensionality
2698        result = self._constructor(self._data.get(key))
2699        if result.columns.is_unique:
2700            result = result[key]
2701
2702        return result
2703
2704    def _getitem_slice(self, key):
2705        return self._slice(key, axis=0)
2706
2707    def _getitem_array(self, key):
2708        # also raises Exception if object array with NA values
2709        if com.is_bool_indexer(key):
2710            # warning here just in case -- previously __setitem__ was
2711            # reindexing but __getitem__ was not; it seems more reasonable to
2712            # go with the __setitem__ behavior since that is more consistent
2713            # with all other indexing behavior
2714            if isinstance(key, Series) and not key.index.equals(self.index):
2715                warnings.warn("Boolean Series key will be reindexed to match "
2716                              "DataFrame index.", UserWarning, stacklevel=3)
2717            elif len(key) != len(self.index):
2718                raise ValueError('Item wrong length %d instead of %d.' %
2719                                 (len(key), len(self.index)))
2720            # check_bool_indexer will throw exception if Series key cannot
2721            # be reindexed to match DataFrame rows
2722            key = check_bool_indexer(self.index, key)
2723            indexer = key.nonzero()[0]
2724            return self._take(indexer, axis=0)
2725        else:
2726            indexer = self.loc._convert_to_indexer(key, axis=1)
2727            return self._take(indexer, axis=1)
2728
2729    def _getitem_multilevel(self, key):
2730        loc = self.columns.get_loc(key)
2731        if isinstance(loc, (slice, Series, np.ndarray, Index)):
2732            new_columns = self.columns[loc]
2733            result_columns = maybe_droplevels(new_columns, key)
2734            if self._is_mixed_type:
2735                result = self.reindex(columns=new_columns)
2736                result.columns = result_columns
2737            else:
2738                new_values = self.values[:, loc]
2739                result = self._constructor(new_values, index=self.index,
2740                                           columns=result_columns)
2741                result = result.__finalize__(self)
2742
2743            # If there is only one column being returned, and its name is
2744            # either an empty string, or a tuple with an empty string as its
2745            # first element, then treat the empty string as a placeholder
2746            # and return the column as if the user had provided that empty
2747            # string in the key. If the result is a Series, exclude the
2748            # implied empty string from its name.
2749            if len(result.columns) == 1:
2750                top = result.columns[0]
2751                if isinstance(top, tuple):
2752                    top = top[0]
2753                if top == '':
2754                    result = result['']
2755                    if isinstance(result, Series):
2756                        result = self._constructor_sliced(result,
2757                                                          index=self.index,
2758                                                          name=key)
2759
2760            result._set_is_copy(self)
2761            return result
2762        else:
2763            return self._get_item_cache(key)
2764
2765    def _getitem_frame(self, key):
2766        if key.values.size and not is_bool_dtype(key.values):
2767            raise ValueError('Must pass DataFrame with boolean values only')
2768        return self.where(key)
2769
2770    def query(self, expr, inplace=False, **kwargs):
2771        """Query the columns of a frame with a boolean expression.
2772
2773        Parameters
2774        ----------
2775        expr : string
2776            The query string to evaluate.  You can refer to variables
2777            in the environment by prefixing them with an '@' character like
2778            ``@a + b``.
2779        inplace : bool
2780            Whether the query should modify the data in place or return
2781            a modified copy
2782
2783            .. versionadded:: 0.18.0
2784
2785        kwargs : dict
2786            See the documentation for :func:`pandas.eval` for complete details
2787            on the keyword arguments accepted by :meth:`DataFrame.query`.
2788
2789        Returns
2790        -------
2791        q : DataFrame
2792
2793        Notes
2794        -----
2795        The result of the evaluation of this expression is first passed to
2796        :attr:`DataFrame.loc` and if that fails because of a
2797        multidimensional key (e.g., a DataFrame) then the result will be passed
2798        to :meth:`DataFrame.__getitem__`.
2799
2800        This method uses the top-level :func:`pandas.eval` function to
2801        evaluate the passed query.
2802
2803        The :meth:`~pandas.DataFrame.query` method uses a slightly
2804        modified Python syntax by default. For example, the ``&`` and ``|``
2805        (bitwise) operators have the precedence of their boolean cousins,
2806        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
2807        however the semantics are different.
2808
2809        You can change the semantics of the expression by passing the keyword
2810        argument ``parser='python'``. This enforces the same semantics as
2811        evaluation in Python space. Likewise, you can pass ``engine='python'``
2812        to evaluate an expression using Python itself as a backend. This is not
2813        recommended as it is inefficient compared to using ``numexpr`` as the
2814        engine.
2815
2816        The :attr:`DataFrame.index` and
2817        :attr:`DataFrame.columns` attributes of the
2818        :class:`~pandas.DataFrame` instance are placed in the query namespace
2819        by default, which allows you to treat both the index and columns of the
2820        frame as a column in the frame.
2821        The identifier ``index`` is used for the frame index; you can also
2822        use the name of the index to identify it in a query. Please note that
2823        Python keywords may not be used as identifiers.
2824
2825        For further details and examples see the ``query`` documentation in
2826        :ref:`indexing <indexing.query>`.
2827
2828        See Also
2829        --------
2830        pandas.eval
2831        DataFrame.eval
2832
2833        Examples
2834        --------
2835        >>> from numpy.random import randn
2836        >>> from pandas import DataFrame
2837        >>> df = pd.DataFrame(randn(10, 2), columns=list('ab'))
2838        >>> df.query('a > b')
2839        >>> df[df.a > df.b]  # same result as the previous expression
2840        """
2841        inplace = validate_bool_kwarg(inplace, 'inplace')
2842        if not isinstance(expr, compat.string_types):
2843            msg = "expr must be a string to be evaluated, {0} given"
2844            raise ValueError(msg.format(type(expr)))
2845        kwargs['level'] = kwargs.pop('level', 0) + 1
2846        kwargs['target'] = None
2847        res = self.eval(expr, **kwargs)
2848
2849        try:
2850            new_data = self.loc[res]
2851        except ValueError:
2852            # when res is multi-dimensional loc raises, but this is sometimes a
2853            # valid query
2854            new_data = self[res]
2855
2856        if inplace:
2857            self._update_inplace(new_data)
2858        else:
2859            return new_data
2860
2861    def eval(self, expr, inplace=False, **kwargs):
2862        """
2863        Evaluate a string describing operations on DataFrame columns.
2864
2865        Operates on columns only, not specific rows or elements.  This allows
2866        `eval` to run arbitrary code, which can make you vulnerable to code
2867        injection if you pass user input to this function.
2868
2869        Parameters
2870        ----------
2871        expr : str
2872            The expression string to evaluate.
2873        inplace : bool, default False
2874            If the expression contains an assignment, whether to perform the
2875            operation inplace and mutate the existing DataFrame. Otherwise,
2876            a new DataFrame is returned.
2877
2878            .. versionadded:: 0.18.0.
2879        kwargs : dict
2880            See the documentation for :func:`~pandas.eval` for complete details
2881            on the keyword arguments accepted by
2882            :meth:`~pandas.DataFrame.query`.
2883
2884        Returns
2885        -------
2886        ndarray, scalar, or pandas object
2887            The result of the evaluation.
2888
2889        See Also
2890        --------
2891        DataFrame.query : Evaluates a boolean expression to query the columns
2892            of a frame.
2893        DataFrame.assign : Can evaluate an expression or function to create new
2894            values for a column.
2895        pandas.eval : Evaluate a Python expression as a string using various
2896            backends.
2897
2898        Notes
2899        -----
2900        For more details see the API documentation for :func:`~pandas.eval`.
2901        For detailed examples see :ref:`enhancing performance with eval
2902        <enhancingperf.eval>`.
2903
2904        Examples
2905        --------
2906        >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
2907        >>> df
2908           A   B
2909        0  1  10
2910        1  2   8
2911        2  3   6
2912        3  4   4
2913        4  5   2
2914        >>> df.eval('A + B')
2915        0    11
2916        1    10
2917        2     9
2918        3     8
2919        4     7
2920        dtype: int64
2921
2922        Assignment is allowed though by default the original DataFrame is not
2923        modified.
2924
2925        >>> df.eval('C = A + B')
2926           A   B   C
2927        0  1  10  11
2928        1  2   8  10
2929        2  3   6   9
2930        3  4   4   8
2931        4  5   2   7
2932        >>> df
2933           A   B
2934        0  1  10
2935        1  2   8
2936        2  3   6
2937        3  4   4
2938        4  5   2
2939
2940        Use ``inplace=True`` to modify the original DataFrame.
2941
2942        >>> df.eval('C = A + B', inplace=True)
2943        >>> df
2944           A   B   C
2945        0  1  10  11
2946        1  2   8  10
2947        2  3   6   9
2948        3  4   4   8
2949        4  5   2   7
2950        """
2951        from pandas.core.computation.eval import eval as _eval
2952
2953        inplace = validate_bool_kwarg(inplace, 'inplace')
2954        resolvers = kwargs.pop('resolvers', None)
2955        kwargs['level'] = kwargs.pop('level', 0) + 1
2956        if resolvers is None:
2957            index_resolvers = self._get_index_resolvers()
2958            resolvers = dict(self.iteritems()), index_resolvers
2959        if 'target' not in kwargs:
2960            kwargs['target'] = self
2961        kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
2962        return _eval(expr, inplace=inplace, **kwargs)
2963
2964    def select_dtypes(self, include=None, exclude=None):
2965        """
2966        Return a subset of the DataFrame's columns based on the column dtypes.
2967
2968        Parameters
2969        ----------
2970        include, exclude : scalar or list-like
2971            A selection of dtypes or strings to be included/excluded. At least
2972            one of these parameters must be supplied.
2973
2974        Raises
2975        ------
2976        ValueError
2977            * If both of ``include`` and ``exclude`` are empty
2978            * If ``include`` and ``exclude`` have overlapping elements
2979            * If any kind of string dtype is passed in.
2980
2981        Returns
2982        -------
2983        subset : DataFrame
2984            The subset of the frame including the dtypes in ``include`` and
2985            excluding the dtypes in ``exclude``.
2986
2987        Notes
2988        -----
2989        * To select all *numeric* types, use ``np.number`` or ``'number'``
2990        * To select strings you must use the ``object`` dtype, but note that
2991          this will return *all* object dtype columns
2992        * See the `numpy dtype hierarchy
2993          <http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html>`__
2994        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
2995          ``'datetime64'``
2996        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
2997          ``'timedelta64'``
2998        * To select Pandas categorical dtypes, use ``'category'``
2999        * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
3000          0.20.0) or ``'datetime64[ns, tz]'``
3001
3002        Examples
3003        --------
3004        >>> df = pd.DataFrame({'a': [1, 2] * 3,
3005        ...                    'b': [True, False] * 3,
3006        ...                    'c': [1.0, 2.0] * 3})
3007        >>> df
3008                a      b  c
3009        0       1   True  1.0
3010        1       2  False  2.0
3011        2       1   True  1.0
3012        3       2  False  2.0
3013        4       1   True  1.0
3014        5       2  False  2.0
3015
3016        >>> df.select_dtypes(include='bool')
3017           b
3018        0  True
3019        1  False
3020        2  True
3021        3  False
3022        4  True
3023        5  False
3024
3025        >>> df.select_dtypes(include=['float64'])
3026           c
3027        0  1.0
3028        1  2.0
3029        2  1.0
3030        3  2.0
3031        4  1.0
3032        5  2.0
3033
3034        >>> df.select_dtypes(exclude=['int'])
3035               b    c
3036        0   True  1.0
3037        1  False  2.0
3038        2   True  1.0
3039        3  False  2.0
3040        4   True  1.0
3041        5  False  2.0
3042        """
3043
3044        if not is_list_like(include):
3045            include = (include,) if include is not None else ()
3046        if not is_list_like(exclude):
3047            exclude = (exclude,) if exclude is not None else ()
3048
3049        selection = tuple(map(frozenset, (include, exclude)))
3050
3051        if not any(selection):
3052            raise ValueError('at least one of include or exclude must be '
3053                             'nonempty')
3054
3055        # convert the myriad valid dtypes object to a single representation
3056        include, exclude = map(
3057            lambda x: frozenset(map(_get_dtype_from_object, x)), selection)
3058        for dtypes in (include, exclude):
3059            invalidate_string_dtypes(dtypes)
3060
3061        # can't both include AND exclude!
3062        if not include.isdisjoint(exclude):
3063            raise ValueError('include and exclude overlap on {inc_ex}'.format(
3064                inc_ex=(include & exclude)))
3065
3066        # empty include/exclude -> defaults to True
3067        # three cases (we've already raised if both are empty)
3068        # case 1: empty include, nonempty exclude
3069        # we have True, True, ... True for include, same for exclude
3070        # in the loop below we get the excluded
3071        # and when we call '&' below we get only the excluded
3072        # case 2: nonempty include, empty exclude
3073        # same as case 1, but with include
3074        # case 3: both nonempty
3075        # the "union" of the logic of case 1 and case 2:
3076        # we get the included and excluded, and return their logical and
3077        include_these = Series(not bool(include), index=self.columns)
3078        exclude_these = Series(not bool(exclude), index=self.columns)
3079
3080        def is_dtype_instance_mapper(idx, dtype):
3081            return idx, functools.partial(issubclass, dtype.type)
3082
3083        for idx, f in itertools.starmap(is_dtype_instance_mapper,
3084                                        enumerate(self.dtypes)):
3085            if include:  # checks for the case of empty include or exclude
3086                include_these.iloc[idx] = any(map(f, include))
3087            if exclude:
3088                exclude_these.iloc[idx] = not any(map(f, exclude))
3089
3090        dtype_indexer = include_these & exclude_these
3091        return self.loc[com._get_info_slice(self, dtype_indexer)]
3092
3093    def _box_item_values(self, key, values):
3094        items = self.columns[self.columns.get_loc(key)]
3095        if values.ndim == 2:
3096            return self._constructor(values.T, columns=items, index=self.index)
3097        else:
3098            return self._box_col_values(values, items)
3099
3100    def _box_col_values(self, values, items):
3101        """ provide boxed values for a column """
3102        klass = _get_sliced_frame_result_type(values, self)
3103        return klass(values, index=self.index, name=items, fastpath=True)
3104
3105    def __setitem__(self, key, value):
3106        key = com._apply_if_callable(key, self)
3107
3108        # see if we can slice the rows
3109        indexer = convert_to_index_sliceable(self, key)
3110        if indexer is not None:
3111            return self._setitem_slice(indexer, value)
3112
3113        if isinstance(key, DataFrame) or getattr(key, 'ndim', None) == 2:
3114            self._setitem_frame(key, value)
3115        elif isinstance(key, (Series, np.ndarray, list, Index)):
3116            self._setitem_array(key, value)
3117        else:
3118            # set column
3119            self._set_item(key, value)
3120
3121    def _setitem_slice(self, key, value):
3122        self._check_setitem_copy()
3123        self.loc._setitem_with_indexer(key, value)
3124
3125    def _setitem_array(self, key, value):
3126        # also raises Exception if object array with NA values
3127        if com.is_bool_indexer(key):
3128            if len(key) != len(self.index):
3129                raise ValueError('Item wrong length %d instead of %d!' %
3130                                 (len(key), len(self.index)))
3131            key = check_bool_indexer(self.index, key)
3132            indexer = key.nonzero()[0]
3133            self._check_setitem_copy()
3134            self.loc._setitem_with_indexer(indexer, value)
3135        else:
3136            if isinstance(value, DataFrame):
3137                if len(value.columns) != len(key):
3138                    raise ValueError('Columns must be same length as key')
3139                for k1, k2 in zip(key, value.columns):
3140                    self[k1] = value[k2]
3141            else:
3142                indexer = self.loc._convert_to_indexer(key, axis=1)
3143                self._check_setitem_copy()
3144                self.loc._setitem_with_indexer((slice(None), indexer), value)
3145
3146    def _setitem_frame(self, key, value):
3147        # support boolean setting with DataFrame input, e.g.
3148        # df[df > df2] = 0
3149        if isinstance(key, np.ndarray):
3150            if key.shape != self.shape:
3151                raise ValueError(
3152                    'Array conditional must be same shape as self'
3153                )
3154            key = self._constructor(key, **self._construct_axes_dict())
3155
3156        if key.values.size and not is_bool_dtype(key.values):
3157            raise TypeError(
3158                'Must pass DataFrame or 2-d ndarray with boolean values only'
3159            )
3160
3161        self._check_inplace_setting(value)
3162        self._check_setitem_copy()
3163        self._where(-key, value, inplace=True)
3164
3165    def _ensure_valid_index(self, value):
3166        """
3167        ensure that if we don't have an index, that we can create one from the
3168        passed value
3169        """
3170        # GH5632, make sure that we are a Series convertible
3171        if not len(self.index) and is_list_like(value):
3172            try:
3173                value = Series(value)
3174            except:
3175                raise ValueError('Cannot set a frame with no defined index '
3176                                 'and a value that cannot be converted to a '
3177                                 'Series')
3178
3179            self._data = self._data.reindex_axis(value.index.copy(), axis=1,
3180                                                 fill_value=np.nan)
3181
3182    def _set_item(self, key, value):
3183        """
3184        Add series to DataFrame in specified column.
3185
3186        If series is a numpy-array (not a Series/TimeSeries), it must be the
3187        same length as the DataFrames index or an error will be thrown.
3188
3189        Series/TimeSeries will be conformed to the DataFrames index to
3190        ensure homogeneity.
3191        """
3192
3193        self._ensure_valid_index(value)
3194        value = self._sanitize_column(key, value)
3195        NDFrame._set_item(self, key, value)
3196
3197        # check if we are modifying a copy
3198        # try to set first as we want an invalid
3199        # value exception to occur first
3200        if len(self):
3201            self._check_setitem_copy()
3202
3203    def insert(self, loc, column, value, allow_duplicates=False):
3204        """
3205        Insert column into DataFrame at specified location.
3206
3207        Raises a ValueError if `column` is already contained in the DataFrame,
3208        unless `allow_duplicates` is set to True.
3209
3210        Parameters
3211        ----------
3212        loc : int
3213            Insertion index. Must verify 0 <= loc <= len(columns)
3214        column : string, number, or hashable object
3215            label of the inserted column
3216        value : int, Series, or array-like
3217        allow_duplicates : bool, optional
3218        """
3219        self._ensure_valid_index(value)
3220        value = self._sanitize_column(column, value, broadcast=False)
3221        self._data.insert(loc, column, value,
3222                          allow_duplicates=allow_duplicates)
3223
3224    def assign(self, **kwargs):
3225        r"""
3226        Assign new columns to a DataFrame, returning a new object
3227        (a copy) with the new columns added to the original ones.
3228        Existing columns that are re-assigned will be overwritten.
3229
3230        Parameters
3231        ----------
3232        kwargs : keyword, value pairs
3233            keywords are the column names. If the values are
3234            callable, they are computed on the DataFrame and
3235            assigned to the new columns. The callable must not
3236            change input DataFrame (though pandas doesn't check it).
3237            If the values are not callable, (e.g. a Series, scalar, or array),
3238            they are simply assigned.
3239
3240        Returns
3241        -------
3242        df : DataFrame
3243            A new DataFrame with the new columns in addition to
3244            all the existing columns.
3245
3246        Notes
3247        -----
3248        Assigning multiple columns within the same ``assign`` is possible.
3249        For Python 3.6 and above, later items in '\*\*kwargs' may refer to
3250        newly created or modified columns in 'df'; items are computed and
3251        assigned into 'df' in order.  For Python 3.5 and below, the order of
3252        keyword arguments is not specified, you cannot refer to newly created
3253        or modified columns. All items are computed first, and then assigned
3254        in alphabetical order.
3255
3256        .. versionchanged :: 0.23.0
3257
3258            Keyword argument order is maintained for Python 3.6 and later.
3259
3260        Examples
3261        --------
3262        >>> df = pd.DataFrame({'A': range(1, 11), 'B': np.random.randn(10)})
3263
3264        Where the value is a callable, evaluated on `df`:
3265
3266        >>> df.assign(ln_A = lambda x: np.log(x.A))
3267            A         B      ln_A
3268        0   1  0.426905  0.000000
3269        1   2 -0.780949  0.693147
3270        2   3 -0.418711  1.098612
3271        3   4 -0.269708  1.386294
3272        4   5 -0.274002  1.609438
3273        5   6 -0.500792  1.791759
3274        6   7  1.649697  1.945910
3275        7   8 -1.495604  2.079442
3276        8   9  0.549296  2.197225
3277        9  10 -0.758542  2.302585
3278
3279        Where the value already exists and is inserted:
3280
3281        >>> newcol = np.log(df['A'])
3282        >>> df.assign(ln_A=newcol)
3283            A         B      ln_A
3284        0   1  0.426905  0.000000
3285        1   2 -0.780949  0.693147
3286        2   3 -0.418711  1.098612
3287        3   4 -0.269708  1.386294
3288        4   5 -0.274002  1.609438
3289        5   6 -0.500792  1.791759
3290        6   7  1.649697  1.945910
3291        7   8 -1.495604  2.079442
3292        8   9  0.549296  2.197225
3293        9  10 -0.758542  2.302585
3294
3295        Where the keyword arguments depend on each other
3296
3297        >>> df = pd.DataFrame({'A': [1, 2, 3]})
3298
3299        >>> df.assign(B=df.A, C=lambda x:x['A']+ x['B'])
3300            A  B  C
3301         0  1  1  2
3302         1  2  2  4
3303         2  3  3  6
3304        """
3305        data = self.copy()
3306
3307        # >= 3.6 preserve order of kwargs
3308        if PY36:
3309            for k, v in kwargs.items():
3310                data[k] = com._apply_if_callable(v, data)
3311        else:
3312            # <= 3.5: do all calculations first...
3313            results = OrderedDict()
3314            for k, v in kwargs.items():
3315                results[k] = com._apply_if_callable(v, data)
3316
3317            # <= 3.5 and earlier
3318            results = sorted(results.items())
3319            # ... and then assign
3320            for k, v in results:
3321                data[k] = v
3322        return data
3323
3324    def _sanitize_column(self, key, value, broadcast=True):
3325        """
3326        Ensures new columns (which go into the BlockManager as new blocks) are
3327        always copied and converted into an array.
3328
3329        Parameters
3330        ----------
3331        key : object
3332        value : scalar, Series, or array-like
3333        broadcast : bool, default True
3334            If ``key`` matches multiple duplicate column names in the
3335            DataFrame, this parameter indicates whether ``value`` should be
3336            tiled so that the returned array contains a (duplicated) column for
3337            each occurrence of the key. If False, ``value`` will not be tiled.
3338
3339        Returns
3340        -------
3341        sanitized_column : numpy-array
3342        """
3343
3344        def reindexer(value):
3345            # reindex if necessary
3346
3347            if value.index.equals(self.index) or not len(self.index):
3348                value = value._values.copy()
3349            else:
3350
3351                # GH 4107
3352                try:
3353                    value = value.reindex(self.index)._values
3354                except Exception as e:
3355
3356                    # duplicate axis
3357                    if not value.index.is_unique:
3358                        raise e
3359
3360                    # other
3361                    raise TypeError('incompatible index of inserted column '
3362                                    'with frame index')
3363            return value
3364
3365        if isinstance(value, Series):
3366            value = reindexer(value)
3367
3368        elif isinstance(value, DataFrame):
3369            # align right-hand-side columns if self.columns
3370            # is multi-index and self[key] is a sub-frame
3371            if isinstance(self.columns, MultiIndex) and key in self.columns:
3372                loc = self.columns.get_loc(key)
3373                if isinstance(loc, (slice, Series, np.ndarray, Index)):
3374                    cols = maybe_droplevels(self.columns[loc], key)
3375                    if len(cols) and not cols.equals(value.columns):
3376                        value = value.reindex(cols, axis=1)
3377            # now align rows
3378            value = reindexer(value).T
3379
3380        elif isinstance(value, ExtensionArray):
3381            from pandas.core.series import _sanitize_index
3382            # Explicitly copy here, instead of in _sanitize_index,
3383            # as sanitize_index won't copy an EA, even with copy=True
3384            value = value.copy()
3385            value = _sanitize_index(value, self.index, copy=False)
3386
3387        elif isinstance(value, Index) or is_sequence(value):
3388            from pandas.core.series import _sanitize_index
3389
3390            # turn me into an ndarray
3391            value = _sanitize_index(value, self.index, copy=False)
3392            if not isinstance(value, (np.ndarray, Index)):
3393                if isinstance(value, list) and len(value) > 0:
3394                    value = maybe_convert_platform(value)
3395                else:
3396                    value = com._asarray_tuplesafe(value)
3397            elif value.ndim == 2:
3398                value = value.copy().T
3399            elif isinstance(value, Index):
3400                value = value.copy(deep=True)
3401            else:
3402                value = value.copy()
3403
3404            # possibly infer to datetimelike
3405            if is_object_dtype(value.dtype):
3406                value = maybe_infer_to_datetimelike(value)
3407
3408        else:
3409            # upcast the scalar
3410            value = cast_scalar_to_array(len(self.index), value)
3411            value = maybe_cast_to_datetime(value, value.dtype)
3412
3413        # return internal types directly
3414        if is_extension_type(value) or is_extension_array_dtype(value):
3415            return value
3416
3417        # broadcast across multiple columns if necessary
3418        if broadcast and key in self.columns and value.ndim == 1:
3419            if (not self.columns.is_unique or
3420                    isinstance(self.columns, MultiIndex)):
3421                existing_piece = self[key]
3422                if isinstance(existing_piece, DataFrame):
3423                    value = np.tile(value, (len(existing_piece.columns), 1))
3424
3425        return np.atleast_2d(np.asarray(value))
3426
3427    @property
3428    def _series(self):
3429        result = {}
3430        for idx, item in enumerate(self.columns):
3431            result[item] = Series(self._data.iget(idx), index=self.index,
3432                                  name=item)
3433        return result
3434
3435    def lookup(self, row_labels, col_labels):
3436        """Label-based "fancy indexing" function for DataFrame.
3437        Given equal-length arrays of row and column labels, return an
3438        array of the values corresponding to each (row, col) pair.
3439
3440        Parameters
3441        ----------
3442        row_labels : sequence
3443            The row labels to use for lookup
3444        col_labels : sequence
3445            The column labels to use for lookup
3446
3447        Notes
3448        -----
3449        Akin to::
3450
3451            result = []
3452            for row, col in zip(row_labels, col_labels):
3453                result.append(df.get_value(row, col))
3454
3455        Examples
3456        --------
3457        values : ndarray
3458            The found values
3459
3460        """
3461        n = len(row_labels)
3462        if n != len(col_labels):
3463            raise ValueError('Row labels must have same size as column labels')
3464
3465        thresh = 1000
3466        if not self._is_mixed_type or n > thresh:
3467            values = self.values
3468            ridx = self.index.get_indexer(row_labels)
3469            cidx = self.columns.get_indexer(col_labels)
3470            if (ridx == -1).any():
3471                raise KeyError('One or more row labels was not found')
3472            if (cidx == -1).any():
3473                raise KeyError('One or more column labels was not found')
3474            flat_index = ridx * len(self.columns) + cidx
3475            result = values.flat[flat_index]
3476        else:
3477            result = np.empty(n, dtype='O')
3478            for i, (r, c) in enumerate(zip(row_labels, col_labels)):
3479                result[i] = self._get_value(r, c)
3480
3481        if is_object_dtype(result):
3482            result = lib.maybe_convert_objects(result)
3483
3484        return result
3485
3486    # ----------------------------------------------------------------------
3487    # Reindexing and alignment
3488
3489    def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
3490                      copy):
3491        frame = self
3492
3493        columns = axes['columns']
3494        if columns is not None:
3495            frame = frame._reindex_columns(columns, method, copy, level,
3496                                           fill_value, limit, tolerance)
3497
3498        index = axes['index']
3499        if index is not None:
3500            frame = frame._reindex_index(index, method, copy, level,
3501                                         fill_value, limit, tolerance)
3502
3503        return frame
3504
3505    def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan,
3506                       limit=None, tolerance=None):
3507        new_index, indexer = self.index.reindex(new_index, method=method,
3508                                                level=level, limit=limit,
3509                                                tolerance=tolerance)
3510        return self._reindex_with_indexers({0: [new_index, indexer]},
3511                                           copy=copy, fill_value=fill_value,
3512                                           allow_dups=False)
3513
3514    def _reindex_columns(self, new_columns, method, copy, level,
3515                         fill_value=None, limit=None, tolerance=None):
3516        new_columns, indexer = self.columns.reindex(new_columns, method=method,
3517                                                    level=level, limit=limit,
3518                                                    tolerance=tolerance)
3519        return self._reindex_with_indexers({1: [new_columns, indexer]},
3520                                           copy=copy, fill_value=fill_value,
3521                                           allow_dups=False)
3522
3523    def _reindex_multi(self, axes, copy, fill_value):
3524        """ we are guaranteed non-Nones in the axes! """
3525
3526        new_index, row_indexer = self.index.reindex(axes['index'])
3527        new_columns, col_indexer = self.columns.reindex(axes['columns'])
3528
3529        if row_indexer is not None and col_indexer is not None:
3530            indexer = row_indexer, col_indexer
3531            new_values = algorithms.take_2d_multi(self.values, indexer,
3532                                                  fill_value=fill_value)
3533            return self._constructor(new_values, index=new_index,
3534                                     columns=new_columns)
3535        else:
3536            return self._reindex_with_indexers({0: [new_index, row_indexer],
3537                                                1: [new_columns, col_indexer]},
3538                                               copy=copy,
3539                                               fill_value=fill_value)
3540
3541    @Appender(_shared_docs['align'] % _shared_doc_kwargs)
3542    def align(self, other, join='outer', axis=None, level=None, copy=True,
3543              fill_value=None, method=None, limit=None, fill_axis=0,
3544              broadcast_axis=None):
3545        return super(DataFrame, self).align(other, join=join, axis=axis,
3546                                            level=level, copy=copy,
3547                                            fill_value=fill_value,
3548                                            method=method, limit=limit,
3549                                            fill_axis=fill_axis,
3550                                            broadcast_axis=broadcast_axis)
3551
3552    @Appender(_shared_docs['reindex'] % _shared_doc_kwargs)
3553    @rewrite_axis_style_signature('labels', [('method', None),
3554                                             ('copy', True),
3555                                             ('level', None),
3556                                             ('fill_value', np.nan),
3557                                             ('limit', None),
3558                                             ('tolerance', None)])
3559    def reindex(self, *args, **kwargs):
3560        axes = validate_axis_style_args(self, args, kwargs, 'labels',
3561                                        'reindex')
3562        kwargs.update(axes)
3563        # Pop these, since the values are in `kwargs` under different names
3564        kwargs.pop('axis', None)
3565        kwargs.pop('labels', None)
3566        return super(DataFrame, self).reindex(**kwargs)
3567
3568    @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)
3569    def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True,
3570                     limit=None, fill_value=np.nan):
3571        return super(DataFrame,
3572                     self).reindex_axis(labels=labels, axis=axis,
3573                                        method=method, level=level, copy=copy,
3574                                        limit=limit, fill_value=fill_value)
3575
3576    def drop(self, labels=None, axis=0, index=None, columns=None,
3577             level=None, inplace=False, errors='raise'):
3578        """
3579        Drop specified labels from rows or columns.
3580
3581        Remove rows or columns by specifying label names and corresponding
3582        axis, or by specifying directly index or column names. When using a
3583        multi-index, labels on different levels can be removed by specifying
3584        the level.
3585
3586        Parameters
3587        ----------
3588        labels : single label or list-like
3589            Index or column labels to drop.
3590        axis : {0 or 'index', 1 or 'columns'}, default 0
3591            Whether to drop labels from the index (0 or 'index') or
3592            columns (1 or 'columns').
3593        index, columns : single label or list-like
3594            Alternative to specifying axis (``labels, axis=1``
3595            is equivalent to ``columns=labels``).
3596
3597            .. versionadded:: 0.21.0
3598        level : int or level name, optional
3599            For MultiIndex, level from which the labels will be removed.
3600        inplace : bool, default False
3601            If True, do operation inplace and return None.
3602        errors : {'ignore', 'raise'}, default 'raise'
3603            If 'ignore', suppress error and only existing labels are
3604            dropped.
3605
3606        Returns
3607        -------
3608        dropped : pandas.DataFrame
3609
3610        See Also
3611        --------
3612        DataFrame.loc : Label-location based indexer for selection by label.
3613        DataFrame.dropna : Return DataFrame with labels on given axis omitted
3614            where (all or any) data are missing
3615        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
3616            removed, optionally only considering certain columns
3617        Series.drop : Return Series with specified index labels removed.
3618
3619        Raises
3620        ------
3621        KeyError
3622            If none of the labels are found in the selected axis
3623
3624        Examples
3625        --------
3626        >>> df = pd.DataFrame(np.arange(12).reshape(3,4),
3627        ...                   columns=['A', 'B', 'C', 'D'])
3628        >>> df
3629           A  B   C   D
3630        0  0  1   2   3
3631        1  4  5   6   7
3632        2  8  9  10  11
3633
3634        Drop columns
3635
3636        >>> df.drop(['B', 'C'], axis=1)
3637           A   D
3638        0  0   3
3639        1  4   7
3640        2  8  11
3641
3642        >>> df.drop(columns=['B', 'C'])
3643           A   D
3644        0  0   3
3645        1  4   7
3646        2  8  11
3647
3648        Drop a row by index
3649
3650        >>> df.drop([0, 1])
3651           A  B   C   D
3652        2  8  9  10  11
3653
3654        Drop columns and/or rows of MultiIndex DataFrame
3655
3656        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
3657        ...                              ['speed', 'weight', 'length']],
3658        ...                      labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
3659        ...                              [0, 1, 2, 0, 1, 2, 0, 1, 2]])
3660        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
3661        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
3662        ...                         [250, 150], [1.5, 0.8], [320, 250],
3663        ...                         [1, 0.8], [0.3,0.2]])
3664        >>> df
3665                        big     small
3666        lama    speed   45.0    30.0
3667                weight  200.0   100.0
3668                length  1.5     1.0
3669        cow     speed   30.0    20.0
3670                weight  250.0   150.0
3671                length  1.5     0.8
3672        falcon  speed   320.0   250.0
3673                weight  1.0     0.8
3674                length  0.3     0.2
3675
3676        >>> df.drop(index='cow', columns='small')
3677                        big
3678        lama    speed   45.0
3679                weight  200.0
3680                length  1.5
3681        falcon  speed   320.0
3682                weight  1.0
3683                length  0.3
3684
3685        >>> df.drop(index='length', level=1)
3686                        big     small
3687        lama    speed   45.0    30.0
3688                weight  200.0   100.0
3689        cow     speed   30.0    20.0
3690                weight  250.0   150.0
3691        falcon  speed   320.0   250.0
3692                weight  1.0     0.8
3693        """
3694        return super(DataFrame, self).drop(labels=labels, axis=axis,
3695                                           index=index, columns=columns,
3696                                           level=level, inplace=inplace,
3697                                           errors=errors)
3698
3699    @rewrite_axis_style_signature('mapper', [('copy', True),
3700                                             ('inplace', False),
3701                                             ('level', None)])
3702    def rename(self, *args, **kwargs):
3703        """Alter axes labels.
3704
3705        Function / dict values must be unique (1-to-1). Labels not contained in
3706        a dict / Series will be left as-is. Extra labels listed don't throw an
3707        error.
3708
3709        See the :ref:`user guide <basics.rename>` for more.
3710
3711        Parameters
3712        ----------
3713        mapper, index, columns : dict-like or function, optional
3714            dict-like or functions transformations to apply to
3715            that axis' values. Use either ``mapper`` and ``axis`` to
3716            specify the axis to target with ``mapper``, or ``index`` and
3717            ``columns``.
3718        axis : int or str, optional
3719            Axis to target with ``mapper``. Can be either the axis name
3720            ('index', 'columns') or number (0, 1). The default is 'index'.
3721        copy : boolean, default True
3722            Also copy underlying data
3723        inplace : boolean, default False
3724            Whether to return a new DataFrame. If True then value of copy is
3725            ignored.
3726        level : int or level name, default None
3727            In case of a MultiIndex, only rename labels in the specified
3728            level.
3729
3730        Returns
3731        -------
3732        renamed : DataFrame
3733
3734        See Also
3735        --------
3736        pandas.DataFrame.rename_axis
3737
3738        Examples
3739        --------
3740
3741        ``DataFrame.rename`` supports two calling conventions
3742
3743        * ``(index=index_mapper, columns=columns_mapper, ...)``
3744        * ``(mapper, axis={'index', 'columns'}, ...)``
3745
3746        We *highly* recommend using keyword arguments to clarify your
3747        intent.
3748
3749        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
3750        >>> df.rename(index=str, columns={"A": "a", "B": "c"})
3751           a  c
3752        0  1  4
3753        1  2  5
3754        2  3  6
3755
3756        >>> df.rename(index=str, columns={"A": "a", "C": "c"})
3757           a  B
3758        0  1  4
3759        1  2  5
3760        2  3  6
3761
3762        Using axis-style parameters
3763
3764        >>> df.rename(str.lower, axis='columns')
3765           a  b
3766        0  1  4
3767        1  2  5
3768        2  3  6
3769
3770        >>> df.rename({1: 2, 2: 4}, axis='index')
3771           A  B
3772        0  1  4
3773        2  2  5
3774        4  3  6
3775        """
3776        axes = validate_axis_style_args(self, args, kwargs, 'mapper', 'rename')
3777        kwargs.update(axes)
3778        # Pop these, since the values are in `kwargs` under different names
3779        kwargs.pop('axis', None)
3780        kwargs.pop('mapper', None)
3781        return super(DataFrame, self).rename(**kwargs)
3782
3783    @Substitution(**_shared_doc_kwargs)
3784    @Appender(NDFrame.fillna.__doc__)
3785    def fillna(self, value=None, method=None, axis=None, inplace=False,
3786               limit=None, downcast=None, **kwargs):
3787        return super(DataFrame,
3788                     self).fillna(value=value, method=method, axis=axis,
3789                                  inplace=inplace, limit=limit,
3790                                  downcast=downcast, **kwargs)
3791
3792    @Appender(_shared_docs['replace'] % _shared_doc_kwargs)
3793    def replace(self, to_replace=None, value=None, inplace=False, limit=None,
3794                regex=False, method='pad'):
3795        return super(DataFrame, self).replace(to_replace=to_replace,
3796                                              value=value, inplace=inplace,
3797                                              limit=limit, regex=regex,
3798                                              method=method)
3799
3800    @Appender(_shared_docs['shift'] % _shared_doc_kwargs)
3801    def shift(self, periods=1, freq=None, axis=0):
3802        return super(DataFrame, self).shift(periods=periods, freq=freq,
3803                                            axis=axis)
3804
3805    def set_index(self, keys, drop=True, append=False, inplace=False,
3806                  verify_integrity=False):
3807        """
3808        Set the DataFrame index (row labels) using one or more existing
3809        columns. By default yields a new object.
3810
3811        Parameters
3812        ----------
3813        keys : column label or list of column labels / arrays
3814        drop : boolean, default True
3815            Delete columns to be used as the new index
3816        append : boolean, default False
3817            Whether to append columns to existing index
3818        inplace : boolean, default False
3819            Modify the DataFrame in place (do not create a new object)
3820        verify_integrity : boolean, default False
3821            Check the new index for duplicates. Otherwise defer the check until
3822            necessary. Setting to False will improve the performance of this
3823            method
3824
3825        Examples
3826        --------
3827        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
3828        ...                    'year': [2012, 2014, 2013, 2014],
3829        ...                    'sale':[55, 40, 84, 31]})
3830           month  sale  year
3831        0  1      55    2012
3832        1  4      40    2014
3833        2  7      84    2013
3834        3  10     31    2014
3835
3836        Set the index to become the 'month' column:
3837
3838        >>> df.set_index('month')
3839               sale  year
3840        month
3841        1      55    2012
3842        4      40    2014
3843        7      84    2013
3844        10     31    2014
3845
3846        Create a multi-index using columns 'year' and 'month':
3847
3848        >>> df.set_index(['year', 'month'])
3849                    sale
3850        year  month
3851        2012  1     55
3852        2014  4     40
3853        2013  7     84
3854        2014  10    31
3855
3856        Create a multi-index using a set of values and a column:
3857
3858        >>> df.set_index([[1, 2, 3, 4], 'year'])
3859                 month  sale
3860           year
3861        1  2012  1      55
3862        2  2014  4      40
3863        3  2013  7      84
3864        4  2014  10     31
3865
3866        Returns
3867        -------
3868        dataframe : DataFrame
3869        """
3870        inplace = validate_bool_kwarg(inplace, 'inplace')
3871        if not isinstance(keys, list):
3872            keys = [keys]
3873
3874        if inplace:
3875            frame = self
3876        else:
3877            frame = self.copy()
3878
3879        arrays = []
3880        names = []
3881        if append:
3882            names = [x for x in self.index.names]
3883            if isinstance(self.index, MultiIndex):
3884                for i in range(self.index.nlevels):
3885                    arrays.append(self.index._get_level_values(i))
3886            else:
3887                arrays.append(self.index)
3888
3889        to_remove = []
3890        for col in keys:
3891            if isinstance(col, MultiIndex):
3892                # append all but the last column so we don't have to modify
3893                # the end of this loop
3894                for n in range(col.nlevels - 1):
3895                    arrays.append(col._get_level_values(n))
3896
3897                level = col._get_level_values(col.nlevels - 1)
3898                names.extend(col.names)
3899            elif isinstance(col, Series):
3900                level = col._values
3901                names.append(col.name)
3902            elif isinstance(col, Index):
3903                level = col
3904                names.append(col.name)
3905            elif isinstance(col, (list, np.ndarray, Index)):
3906                level = col
3907                names.append(None)
3908            else:
3909                level = frame[col]._values
3910                names.append(col)
3911                if drop:
3912                    to_remove.append(col)
3913            arrays.append(level)
3914
3915        index = _ensure_index_from_sequences(arrays, names)
3916
3917        if verify_integrity and not index.is_unique:
3918            duplicates = index[index.duplicated()].unique()
3919            raise ValueError('Index has duplicate keys: {dup}'.format(
3920                dup=duplicates))
3921
3922        for c in to_remove:
3923            del frame[c]
3924
3925        # clear up memory usage
3926        index._cleanup()
3927
3928        frame.index = index
3929
3930        if not inplace:
3931            return frame
3932
3933    def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
3934                    col_fill=''):
3935        """
3936        For DataFrame with multi-level index, return new DataFrame with
3937        labeling information in the columns under the index names, defaulting
3938        to 'level_0', 'level_1', etc. if any are None. For a standard index,
3939        the index name will be used (if set), otherwise a default 'index' or
3940        'level_0' (if 'index' is already taken) will be used.
3941
3942        Parameters
3943        ----------
3944        level : int, str, tuple, or list, default None
3945            Only remove the given levels from the index. Removes all levels by
3946            default
3947        drop : boolean, default False
3948            Do not try to insert index into dataframe columns. This resets
3949            the index to the default integer index.
3950        inplace : boolean, default False
3951            Modify the DataFrame in place (do not create a new object)
3952        col_level : int or str, default 0
3953            If the columns have multiple levels, determines which level the
3954            labels are inserted into. By default it is inserted into the first
3955            level.
3956        col_fill : object, default ''
3957            If the columns have multiple levels, determines how the other
3958            levels are named. If None then the index name is repeated.
3959
3960        Returns
3961        -------
3962        resetted : DataFrame
3963
3964        Examples
3965        --------
3966        >>> df = pd.DataFrame([('bird',    389.0),
3967        ...                    ('bird',     24.0),
3968        ...                    ('mammal',   80.5),
3969        ...                    ('mammal', np.nan)],
3970        ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
3971        ...                   columns=('class', 'max_speed'))
3972        >>> df
3973                 class  max_speed
3974        falcon    bird      389.0
3975        parrot    bird       24.0
3976        lion    mammal       80.5
3977        monkey  mammal        NaN
3978
3979        When we reset the index, the old index is added as a column, and a
3980        new sequential index is used:
3981
3982        >>> df.reset_index()
3983            index   class  max_speed
3984        0  falcon    bird      389.0
3985        1  parrot    bird       24.0
3986        2    lion  mammal       80.5
3987        3  monkey  mammal        NaN
3988
3989        We can use the `drop` parameter to avoid the old index being added as
3990        a column:
3991
3992        >>> df.reset_index(drop=True)
3993            class  max_speed
3994        0    bird      389.0
3995        1    bird       24.0
3996        2  mammal       80.5
3997        3  mammal        NaN
3998
3999        You can also use `reset_index` with `MultiIndex`.
4000
4001        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
4002        ...                                    ('bird', 'parrot'),
4003        ...                                    ('mammal', 'lion'),
4004        ...                                    ('mammal', 'monkey')],
4005        ...                                   names=['class', 'name'])
4006        >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
4007        ...                                      ('species', 'type')])
4008        >>> df = pd.DataFrame([(389.0, 'fly'),
4009        ...                    ( 24.0, 'fly'),
4010        ...                    ( 80.5, 'run'),
4011        ...                    (np.nan, 'jump')],
4012        ...                   index=index,
4013        ...                   columns=columns)
4014        >>> df
4015                       speed species
4016                         max    type
4017        class  name
4018        bird   falcon  389.0     fly
4019               parrot   24.0     fly
4020        mammal lion     80.5     run
4021               monkey    NaN    jump
4022
4023        If the index has multiple levels, we can reset a subset of them:
4024
4025        >>> df.reset_index(level='class')
4026                 class  speed species
4027                          max    type
4028        name
4029        falcon    bird  389.0     fly
4030        parrot    bird   24.0     fly
4031        lion    mammal   80.5     run
4032        monkey  mammal    NaN    jump
4033
4034        If we are not dropping the index, by default, it is placed in the top
4035        level. We can place it in another level:
4036
4037        >>> df.reset_index(level='class', col_level=1)
4038                        speed species
4039                 class    max    type
4040        name
4041        falcon    bird  389.0     fly
4042        parrot    bird   24.0     fly
4043        lion    mammal   80.5     run
4044        monkey  mammal    NaN    jump
4045
4046        When the index is inserted under another level, we can specify under
4047        which one with the parameter `col_fill`:
4048
4049        >>> df.reset_index(level='class', col_level=1, col_fill='species')
4050                      species  speed species
4051                        class    max    type
4052        name
4053        falcon           bird  389.0     fly
4054        parrot           bird   24.0     fly
4055        lion           mammal   80.5     run
4056        monkey         mammal    NaN    jump
4057
4058        If we specify a nonexistent level for `col_fill`, it is created:
4059
4060        >>> df.reset_index(level='class', col_level=1, col_fill='genus')
4061                        genus  speed species
4062                        class    max    type
4063        name
4064        falcon           bird  389.0     fly
4065        parrot           bird   24.0     fly
4066        lion           mammal   80.5     run
4067        monkey         mammal    NaN    jump
4068        """
4069        inplace = validate_bool_kwarg(inplace, 'inplace')
4070        if inplace:
4071            new_obj = self
4072        else:
4073            new_obj = self.copy()
4074
4075        def _maybe_casted_values(index, labels=None):
4076            values = index._values
4077            if not isinstance(index, (PeriodIndex, DatetimeIndex)):
4078                if values.dtype == np.object_:
4079                    values = lib.maybe_convert_objects(values)
4080
4081            # if we have the labels, extract the values with a mask
4082            if labels is not None:
4083                mask = labels == -1
4084
4085                # we can have situations where the whole mask is -1,
4086                # meaning there is nothing found in labels, so make all nan's
4087                if mask.all():
4088                    values = np.empty(len(mask))
4089                    values.fill(np.nan)
4090                else:
4091                    values = values.take(labels)
4092                    if mask.any():
4093                        values, changed = maybe_upcast_putmask(
4094                            values, mask, np.nan)
4095            return values
4096
4097        new_index = com._default_index(len(new_obj))
4098        if level is not None:
4099            if not isinstance(level, (tuple, list)):
4100                level = [level]
4101            level = [self.index._get_level_number(lev) for lev in level]
4102            if isinstance(self.index, MultiIndex):
4103                if len(level) < self.index.nlevels:
4104                    new_index = self.index.droplevel(level)
4105
4106        if not drop:
4107            if isinstance(self.index, MultiIndex):
4108                names = [n if n is not None else ('level_%d' % i)
4109                         for (i, n) in enumerate(self.index.names)]
4110                to_insert = lzip(self.index.levels, self.index.labels)
4111            else:
4112                default = 'index' if 'index' not in self else 'level_0'
4113                names = ([default] if self.index.name is None
4114                         else [self.index.name])
4115                to_insert = ((self.index, None),)
4116
4117            multi_col = isinstance(self.columns, MultiIndex)
4118            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
4119                if not (level is None or i in level):
4120                    continue
4121                name = names[i]
4122                if multi_col:
4123                    col_name = (list(name) if isinstance(name, tuple)
4124                                else [name])
4125                    if col_fill is None:
4126                        if len(col_name) not in (1, self.columns.nlevels):
4127                            raise ValueError("col_fill=None is incompatible "
4128                                             "with incomplete column name "
4129                                             "{}".format(name))
4130                        col_fill = col_name[0]
4131
4132                    lev_num = self.columns._get_level_number(col_level)
4133                    name_lst = [col_fill] * lev_num + col_name
4134                    missing = self.columns.nlevels - len(name_lst)
4135                    name_lst += [col_fill] * missing
4136                    name = tuple(name_lst)
4137                # to ndarray and maybe infer different dtype
4138                level_values = _maybe_casted_values(lev, lab)
4139                new_obj.insert(0, name, level_values)
4140
4141        new_obj.index = new_index
4142        if not inplace:
4143            return new_obj
4144
4145    # ----------------------------------------------------------------------
4146    # Reindex-based selection methods
4147
4148    @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
4149    def isna(self):
4150        return super(DataFrame, self).isna()
4151
4152    @Appender(_shared_docs['isna'] % _shared_doc_kwargs)
4153    def isnull(self):
4154        return super(DataFrame, self).isnull()
4155
4156    @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
4157    def notna(self):
4158        return super(DataFrame, self).notna()
4159
4160    @Appender(_shared_docs['notna'] % _shared_doc_kwargs)
4161    def notnull(self):
4162        return super(DataFrame, self).notnull()
4163
4164    def dropna(self, axis=0, how='any', thresh=None, subset=None,
4165               inplace=False):
4166        """
4167        Remove missing values.
4168
4169        See the :ref:`User Guide <missing_data>` for more on which values are
4170        considered missing, and how to work with missing data.
4171
4172        Parameters
4173        ----------
4174        axis : {0 or 'index', 1 or 'columns'}, default 0
4175            Determine if rows or columns which contain missing values are
4176            removed.
4177
4178            * 0, or 'index' : Drop rows which contain missing values.
4179            * 1, or 'columns' : Drop columns which contain missing value.
4180
4181            .. deprecated:: 0.23.0: Pass tuple or list to drop on multiple
4182            axes.
4183        how : {'any', 'all'}, default 'any'
4184            Determine if row or column is removed from DataFrame, when we have
4185            at least one NA or all NA.
4186
4187            * 'any' : If any NA values are present, drop that row or column.
4188            * 'all' : If all values are NA, drop that row or column.
4189        thresh : int, optional
4190            Require that many non-NA values.
4191        subset : array-like, optional
4192            Labels along other axis to consider, e.g. if you are dropping rows
4193            these would be a list of columns to include.
4194        inplace : bool, default False
4195            If True, do operation inplace and return None.
4196
4197        Returns
4198        -------
4199        DataFrame
4200            DataFrame with NA entries dropped from it.
4201
4202        See Also
4203        --------
4204        DataFrame.isna: Indicate missing values.
4205        DataFrame.notna : Indicate existing (non-missing) values.
4206        DataFrame.fillna : Replace missing values.
4207        Series.dropna : Drop missing values.
4208        Index.dropna : Drop missing indices.
4209
4210        Examples
4211        --------
4212        >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
4213        ...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
4214        ...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
4215        ...                             pd.NaT]})
4216        >>> df
4217               name        toy       born
4218        0    Alfred        NaN        NaT
4219        1    Batman  Batmobile 1940-04-25
4220        2  Catwoman   Bullwhip        NaT
4221
4222        Drop the rows where at least one element is missing.
4223
4224        >>> df.dropna()
4225             name        toy       born
4226        1  Batman  Batmobile 1940-04-25
4227
4228        Drop the columns where at least one element is missing.
4229
4230        >>> df.dropna(axis='columns')
4231               name
4232        0    Alfred
4233        1    Batman
4234        2  Catwoman
4235
4236        Drop the rows where all elements are missing.
4237
4238        >>> df.dropna(how='all')
4239               name        toy       born
4240        0    Alfred        NaN        NaT
4241        1    Batman  Batmobile 1940-04-25
4242        2  Catwoman   Bullwhip        NaT
4243
4244        Keep only the rows with at least 2 non-NA values.
4245
4246        >>> df.dropna(thresh=2)
4247               name        toy       born
4248        1    Batman  Batmobile 1940-04-25
4249        2  Catwoman   Bullwhip        NaT
4250
4251        Define in which columns to look for missing values.
4252
4253        >>> df.dropna(subset=['name', 'born'])
4254               name        toy       born
4255        1    Batman  Batmobile 1940-04-25
4256
4257        Keep the DataFrame with valid entries in the same variable.
4258
4259        >>> df.dropna(inplace=True)
4260        >>> df
4261             name        toy       born
4262        1  Batman  Batmobile 1940-04-25
4263        """
4264        inplace = validate_bool_kwarg(inplace, 'inplace')
4265        if isinstance(axis, (tuple, list)):
4266            # GH20987
4267            msg = ("supplying multiple axes to axis is deprecated and "
4268                   "will be removed in a future version.")
4269            warnings.warn(msg, FutureWarning, stacklevel=2)
4270
4271            result = self
4272            for ax in axis:
4273                result = result.dropna(how=how, thresh=thresh, subset=subset,
4274                                       axis=ax)
4275        else:
4276            axis = self._get_axis_number(axis)
4277            agg_axis = 1 - axis
4278
4279            agg_obj = self
4280            if subset is not None:
4281                ax = self._get_axis(agg_axis)
4282                indices = ax.get_indexer_for(subset)
4283                check = indices == -1
4284                if check.any():
4285                    raise KeyError(list(np.compress(check, subset)))
4286                agg_obj = self.take(indices, axis=agg_axis)
4287
4288            count = agg_obj.count(axis=agg_axis)
4289
4290            if thresh is not None:
4291                mask = count >= thresh
4292            elif how == 'any':
4293                mask = count == len(agg_obj._get_axis(agg_axis))
4294            elif how == 'all':
4295                mask = count > 0
4296            else:
4297                if how is not None:
4298                    raise ValueError('invalid how option: {h}'.format(h=how))
4299                else:
4300                    raise TypeError('must specify how or thresh')
4301
4302            result = self._take(mask.nonzero()[0], axis=axis)
4303
4304        if inplace:
4305            self._update_inplace(result)
4306        else:
4307            return result
4308
4309    def drop_duplicates(self, subset=None, keep='first', inplace=False):
4310        """
4311        Return DataFrame with duplicate rows removed, optionally only
4312        considering certain columns
4313
4314        Parameters
4315        ----------
4316        subset : column label or sequence of labels, optional
4317            Only consider certain columns for identifying duplicates, by
4318            default use all of the columns
4319        keep : {'first', 'last', False}, default 'first'
4320            - ``first`` : Drop duplicates except for the first occurrence.
4321            - ``last`` : Drop duplicates except for the last occurrence.
4322            - False : Drop all duplicates.
4323        inplace : boolean, default False
4324            Whether to drop duplicates in place or to return a copy
4325
4326        Returns
4327        -------
4328        deduplicated : DataFrame
4329        """
4330        inplace = validate_bool_kwarg(inplace, 'inplace')
4331        duplicated = self.duplicated(subset, keep=keep)
4332
4333        if inplace:
4334            inds, = (-duplicated).nonzero()
4335            new_data = self._data.take(inds)
4336            self._update_inplace(new_data)
4337        else:
4338            return self[-duplicated]
4339
4340    def duplicated(self, subset=None, keep='first'):
4341        """
4342        Return boolean Series denoting duplicate rows, optionally only
4343        considering certain columns
4344
4345        Parameters
4346        ----------
4347        subset : column label or sequence of labels, optional
4348            Only consider certain columns for identifying duplicates, by
4349            default use all of the columns
4350        keep : {'first', 'last', False}, default 'first'
4351            - ``first`` : Mark duplicates as ``True`` except for the
4352              first occurrence.
4353            - ``last`` : Mark duplicates as ``True`` except for the
4354              last occurrence.
4355            - False : Mark all duplicates as ``True``.
4356
4357        Returns
4358        -------
4359        duplicated : Series
4360        """
4361        from pandas.core.sorting import get_group_index
4362        from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
4363
4364        def f(vals):
4365            labels, shape = algorithms.factorize(
4366                vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
4367            return labels.astype('i8', copy=False), len(shape)
4368
4369        if subset is None:
4370            subset = self.columns
4371        elif (not np.iterable(subset) or
4372              isinstance(subset, compat.string_types) or
4373              isinstance(subset, tuple) and subset in self.columns):
4374            subset = subset,
4375
4376        # Verify all columns in subset exist in the queried dataframe
4377        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
4378        # key that doesn't exist.
4379        diff = Index(subset).difference(self.columns)
4380        if not diff.empty:
4381            raise KeyError(diff)
4382
4383        vals = (col.values for name, col in self.iteritems()
4384                if name in subset)
4385        labels, shape = map(list, zip(*map(f, vals)))
4386
4387        ids = get_group_index(labels, shape, sort=False, xnull=False)
4388        return Series(duplicated_int64(ids, keep), index=self.index)
4389
4390    # ----------------------------------------------------------------------
4391    # Sorting
4392
4393    @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs)
4394    def sort_values(self, by, axis=0, ascending=True, inplace=False,
4395                    kind='quicksort', na_position='last'):
4396        inplace = validate_bool_kwarg(inplace, 'inplace')
4397        axis = self._get_axis_number(axis)
4398        stacklevel = 2  # Number of stack levels from df.sort_values
4399
4400        if not isinstance(by, list):
4401            by = [by]
4402        if is_sequence(ascending) and len(by) != len(ascending):
4403            raise ValueError('Length of ascending (%d) != length of by (%d)' %
4404                             (len(ascending), len(by)))
4405        if len(by) > 1:
4406            from pandas.core.sorting import lexsort_indexer
4407
4408            keys = []
4409            for x in by:
4410                k = self._get_label_or_level_values(x, axis=axis,
4411                                                    stacklevel=stacklevel)
4412                keys.append(k)
4413            indexer = lexsort_indexer(keys, orders=ascending,
4414                                      na_position=na_position)
4415            indexer = _ensure_platform_int(indexer)
4416        else:
4417            from pandas.core.sorting import nargsort
4418
4419            by = by[0]
4420            k = self._get_label_or_level_values(by, axis=axis,
4421                                                stacklevel=stacklevel)
4422
4423            if isinstance(ascending, (tuple, list)):
4424                ascending = ascending[0]
4425
4426            indexer = nargsort(k, kind=kind, ascending=ascending,
4427                               na_position=na_position)
4428
4429        new_data = self._data.take(indexer,
4430                                   axis=self._get_block_manager_axis(axis),
4431                                   verify=False)
4432
4433        if inplace:
4434            return self._update_inplace(new_data)
4435        else:
4436            return self._constructor(new_data).__finalize__(self)
4437
4438    @Appender(_shared_docs['sort_index'] % _shared_doc_kwargs)
4439    def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
4440                   kind='quicksort', na_position='last', sort_remaining=True,
4441                   by=None):
4442
4443        # TODO: this can be combined with Series.sort_index impl as
4444        # almost identical
4445
4446        inplace = validate_bool_kwarg(inplace, 'inplace')
4447        # 10726
4448        if by is not None:
4449            warnings.warn("by argument to sort_index is deprecated, "
4450                          "please use .sort_values(by=...)",
4451                          FutureWarning, stacklevel=2)
4452            if level is not None:
4453                raise ValueError("unable to simultaneously sort by and level")
4454            return self.sort_values(by, axis=axis, ascending=ascending,
4455                                    inplace=inplace)
4456
4457        axis = self._get_axis_number(axis)
4458        labels = self._get_axis(axis)
4459
4460        # make sure that the axis is lexsorted to start
4461        # if not we need to reconstruct to get the correct indexer
4462        labels = labels._sort_levels_monotonic()
4463        if level is not None:
4464
4465            new_axis, indexer = labels.sortlevel(level, ascending=ascending,
4466                                                 sort_remaining=sort_remaining)
4467
4468        elif isinstance(labels, MultiIndex):
4469            from pandas.core.sorting import lexsort_indexer
4470
4471            indexer = lexsort_indexer(labels._get_labels_for_sorting(),
4472                                      orders=ascending,
4473                                      na_position=na_position)
4474        else:
4475            from pandas.core.sorting import nargsort
4476
4477            # Check monotonic-ness before sort an index
4478            # GH11080
4479            if ((ascending and labels.is_monotonic_increasing) or
4480                    (not ascending and labels.is_monotonic_decreasing)):
4481                if inplace:
4482                    return
4483                else:
4484                    return self.copy()
4485
4486            indexer = nargsort(labels, kind=kind, ascending=ascending,
4487                               na_position=na_position)
4488
4489        baxis = self._get_block_manager_axis(axis)
4490        new_data = self._data.take(indexer,
4491                                   axis=baxis,
4492                                   verify=False)
4493
4494        # reconstruct axis if needed
4495        new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic()
4496
4497        if inplace:
4498            return self._update_inplace(new_data)
4499        else:
4500            return self._constructor(new_data).__finalize__(self)
4501
4502    def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
4503                  sort_remaining=True):
4504        """Sort multilevel index by chosen axis and primary level. Data will be
4505        lexicographically sorted by the chosen level followed by the other
4506        levels (in order).
4507
4508        .. deprecated:: 0.20.0
4509            Use :meth:`DataFrame.sort_index`
4510
4511
4512        Parameters
4513        ----------
4514        level : int
4515        axis : {0 or 'index', 1 or 'columns'}, default 0
4516        ascending : boolean, default True
4517        inplace : boolean, default False
4518            Sort the DataFrame without creating a new instance
4519        sort_remaining : boolean, default True
4520            Sort by the other levels too.
4521
4522        Returns
4523        -------
4524        sorted : DataFrame
4525
4526        See Also
4527        --------
4528        DataFrame.sort_index(level=...)
4529
4530        """
4531        warnings.warn("sortlevel is deprecated, use sort_index(level= ...)",
4532                      FutureWarning, stacklevel=2)
4533        return self.sort_index(level=level, axis=axis, ascending=ascending,
4534                               inplace=inplace, sort_remaining=sort_remaining)
4535
4536    def nlargest(self, n, columns, keep='first'):
4537        """
4538        Return the first `n` rows ordered by `columns` in descending order.
4539
4540        Return the first `n` rows with the largest values in `columns`, in
4541        descending order. The columns that are not specified are returned as
4542        well, but not used for ordering.
4543
4544        This method is equivalent to
4545        ``df.sort_values(columns, ascending=False).head(n)``, but more
4546        performant.
4547
4548        Parameters
4549        ----------
4550        n : int
4551            Number of rows to return.
4552        columns : label or list of labels
4553            Column label(s) to order by.
4554        keep : {'first', 'last'}, default 'first'
4555            Where there are duplicate values:
4556
4557            - `first` : prioritize the first occurrence(s)
4558            - `last` : prioritize the last occurrence(s)
4559
4560        Returns
4561        -------
4562        DataFrame
4563            The first `n` rows ordered by the given columns in descending
4564            order.
4565
4566        See Also
4567        --------
4568        DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
4569            ascending order.
4570        DataFrame.sort_values : Sort DataFrame by the values
4571        DataFrame.head : Return the first `n` rows without re-ordering.
4572
4573        Notes
4574        -----
4575        This function cannot be used with all column types. For example, when
4576        specifying columns with `object` or `category` dtypes, ``TypeError`` is
4577        raised.
4578
4579        Examples
4580        --------
4581        >>> df = pd.DataFrame({'a': [1, 10, 8, 10, -1],
4582        ...                    'b': list('abdce'),
4583        ...                    'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
4584        >>> df
4585            a  b    c
4586        0   1  a  1.0
4587        1  10  b  2.0
4588        2   8  d  NaN
4589        3  10  c  3.0
4590        4  -1  e  4.0
4591
4592        In the following example, we will use ``nlargest`` to select the three
4593        rows having the largest values in column "a".
4594
4595        >>> df.nlargest(3, 'a')
4596            a  b    c
4597        1  10  b  2.0
4598        3  10  c  3.0
4599        2   8  d  NaN
4600
4601        When using ``keep='last'``, ties are resolved in reverse order:
4602
4603        >>> df.nlargest(3, 'a', keep='last')
4604            a  b    c
4605        3  10  c  3.0
4606        1  10  b  2.0
4607        2   8  d  NaN
4608
4609        To order by the largest values in column "a" and then "c", we can
4610        specify multiple columns like in the next example.
4611
4612        >>> df.nlargest(3, ['a', 'c'])
4613            a  b    c
4614        3  10  c  3.0
4615        1  10  b  2.0
4616        2   8  d  NaN
4617
4618        Attempting to use ``nlargest`` on non-numeric dtypes will raise a
4619        ``TypeError``:
4620
4621        >>> df.nlargest(3, 'b')
4622        Traceback (most recent call last):
4623        TypeError: Column 'b' has dtype object, cannot use method 'nlargest'
4624        """
4625        return algorithms.SelectNFrame(self,
4626                                       n=n,
4627                                       keep=keep,
4628                                       columns=columns).nlargest()
4629
4630    def nsmallest(self, n, columns, keep='first'):
4631        """Get the rows of a DataFrame sorted by the `n` smallest
4632        values of `columns`.
4633
4634        Parameters
4635        ----------
4636        n : int
4637            Number of items to retrieve
4638        columns : list or str
4639            Column name or names to order by
4640        keep : {'first', 'last'}, default 'first'
4641            Where there are duplicate values:
4642            - ``first`` : take the first occurrence.
4643            - ``last`` : take the last occurrence.
4644
4645        Returns
4646        -------
4647        DataFrame
4648
4649        Examples
4650        --------
4651        >>> df = pd.DataFrame({'a': [1, 10, 8, 11, -1],
4652        ...                    'b': list('abdce'),
4653        ...                    'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
4654        >>> df.nsmallest(3, 'a')
4655           a  b   c
4656        4 -1  e   4
4657        0  1  a   1
4658        2  8  d NaN
4659        """
4660        return algorithms.SelectNFrame(self,
4661                                       n=n,
4662                                       keep=keep,
4663                                       columns=columns).nsmallest()
4664
4665    def swaplevel(self, i=-2, j=-1, axis=0):
4666        """
4667        Swap levels i and j in a MultiIndex on a particular axis
4668
4669        Parameters
4670        ----------
4671        i, j : int, string (can be mixed)
4672            Level of index to be swapped. Can pass level name as string.
4673
4674        Returns
4675        -------
4676        swapped : type of caller (new object)
4677
4678        .. versionchanged:: 0.18.1
4679
4680           The indexes ``i`` and ``j`` are now optional, and default to
4681           the two innermost levels of the index.
4682
4683        """
4684        result = self.copy()
4685
4686        axis = self._get_axis_number(axis)
4687        if axis == 0:
4688            result.index = result.index.swaplevel(i, j)
4689        else:
4690            result.columns = result.columns.swaplevel(i, j)
4691        return result
4692
4693    def reorder_levels(self, order, axis=0):
4694        """
4695        Rearrange index levels using input order.
4696        May not drop or duplicate levels
4697
4698        Parameters
4699        ----------
4700        order : list of int or list of str
4701            List representing new level order. Reference level by number
4702            (position) or by key (label).
4703        axis : int
4704            Where to reorder levels.
4705
4706        Returns
4707        -------
4708        type of caller (new object)
4709        """
4710        axis = self._get_axis_number(axis)
4711        if not isinstance(self._get_axis(axis),
4712                          MultiIndex):  # pragma: no cover
4713            raise TypeError('Can only reorder levels on a hierarchical axis.')
4714
4715        result = self.copy()
4716
4717        if axis == 0:
4718            result.index = result.index.reorder_levels(order)
4719        else:
4720            result.columns = result.columns.reorder_levels(order)
4721        return result
4722
4723    # ----------------------------------------------------------------------
4724    # Arithmetic / combination related
4725
4726    def _combine_frame(self, other, func, fill_value=None, level=None):
4727        this, other = self.align(other, join='outer', level=level, copy=False)
4728        new_index, new_columns = this.index, this.columns
4729
4730        def _arith_op(left, right):
4731            # for the mixed_type case where we iterate over columns,
4732            # _arith_op(left, right) is equivalent to
4733            # left._binop(right, func, fill_value=fill_value)
4734            left, right = ops.fill_binop(left, right, fill_value)
4735            return func(left, right)
4736
4737        if this._is_mixed_type or other._is_mixed_type:
4738            # iterate over columns
4739            if this.columns.is_unique:
4740                # unique columns
4741                result = {col: _arith_op(this[col], other[col])
4742                          for col in this}
4743                result = self._constructor(result, index=new_index,
4744                                           columns=new_columns, copy=False)
4745            else:
4746                # non-unique columns
4747                result = {i: _arith_op(this.iloc[:, i], other.iloc[:, i])
4748                          for i, col in enumerate(this.columns)}
4749                result = self._constructor(result, index=new_index, copy=False)
4750                result.columns = new_columns
4751            return result
4752
4753        else:
4754            result = _arith_op(this.values, other.values)
4755
4756        return self._constructor(result, index=new_index, columns=new_columns,
4757                                 copy=False)
4758
4759    def _combine_match_index(self, other, func, level=None):
4760        left, right = self.align(other, join='outer', axis=0, level=level,
4761                                 copy=False)
4762        new_data = func(left.values.T, right.values).T
4763        return self._constructor(new_data,
4764                                 index=left.index, columns=self.columns,
4765                                 copy=False)
4766
4767    def _combine_match_columns(self, other, func, level=None, try_cast=True):
4768        left, right = self.align(other, join='outer', axis=1, level=level,
4769                                 copy=False)
4770
4771        new_data = left._data.eval(func=func, other=right,
4772                                   axes=[left.columns, self.index],
4773                                   try_cast=try_cast)
4774        return self._constructor(new_data)
4775
4776    def _combine_const(self, other, func, errors='raise', try_cast=True):
4777        new_data = self._data.eval(func=func, other=other,
4778                                   errors=errors,
4779                                   try_cast=try_cast)
4780        return self._constructor(new_data)
4781
4782    def _compare_frame(self, other, func, str_rep):
4783        # compare_frame assumes self._indexed_same(other)
4784
4785        import pandas.core.computation.expressions as expressions
4786        # unique
4787        if self.columns.is_unique:
4788
4789            def _compare(a, b):
4790                return {col: func(a[col], b[col]) for col in a.columns}
4791
4792            new_data = expressions.evaluate(_compare, str_rep, self, other)
4793            return self._constructor(data=new_data, index=self.index,
4794                                     columns=self.columns, copy=False)
4795        # non-unique
4796        else:
4797
4798            def _compare(a, b):
4799                return {i: func(a.iloc[:, i], b.iloc[:, i])
4800                        for i, col in enumerate(a.columns)}
4801
4802            new_data = expressions.evaluate(_compare, str_rep, self, other)
4803            result = self._constructor(data=new_data, index=self.index,
4804                                       copy=False)
4805            result.columns = self.columns
4806            return result
4807
4808    def combine(self, other, func, fill_value=None, overwrite=True):
4809        """
4810        Add two DataFrame objects and do not propagate NaN values, so if for a
4811        (column, time) one frame is missing a value, it will default to the
4812        other frame's value (which might be NaN as well)
4813
4814        Parameters
4815        ----------
4816        other : DataFrame
4817        func : function
4818            Function that takes two series as inputs and return a Series or a
4819            scalar
4820        fill_value : scalar value
4821        overwrite : boolean, default True
4822            If True then overwrite values for common keys in the calling frame
4823
4824        Returns
4825        -------
4826        result : DataFrame
4827
4828        Examples
4829        --------
4830        >>> df1 = DataFrame({'A': [0, 0], 'B': [4, 4]})
4831        >>> df2 = DataFrame({'A': [1, 1], 'B': [3, 3]})
4832        >>> df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2)
4833           A  B
4834        0  0  3
4835        1  0  3
4836
4837        See Also
4838        --------
4839        DataFrame.combine_first : Combine two DataFrame objects and default to
4840            non-null values in frame calling the method
4841        """
4842        other_idxlen = len(other.index)  # save for compare
4843
4844        this, other = self.align(other, copy=False)
4845        new_index = this.index
4846
4847        if other.empty and len(new_index) == len(self.index):
4848            return self.copy()
4849
4850        if self.empty and len(other) == other_idxlen:
4851            return other.copy()
4852
4853        # sorts if possible
4854        new_columns = this.columns.union(other.columns)
4855        do_fill = fill_value is not None
4856
4857        result = {}
4858        for col in new_columns:
4859            series = this[col]
4860            otherSeries = other[col]
4861
4862            this_dtype = series.dtype
4863            other_dtype = otherSeries.dtype
4864
4865            this_mask = isna(series)
4866            other_mask = isna(otherSeries)
4867
4868            # don't overwrite columns unecessarily
4869            # DO propagate if this column is not in the intersection
4870            if not overwrite and other_mask.all():
4871                result[col] = this[col].copy()
4872                continue
4873
4874            if do_fill:
4875                series = series.copy()
4876                otherSeries = otherSeries.copy()
4877                series[this_mask] = fill_value
4878                otherSeries[other_mask] = fill_value
4879
4880            # if we have different dtypes, possibly promote
4881            new_dtype = this_dtype
4882            if not is_dtype_equal(this_dtype, other_dtype):
4883                new_dtype = find_common_type([this_dtype, other_dtype])
4884                if not is_dtype_equal(this_dtype, new_dtype):
4885                    series = series.astype(new_dtype)
4886                if not is_dtype_equal(other_dtype, new_dtype):
4887                    otherSeries = otherSeries.astype(new_dtype)
4888
4889            # see if we need to be represented as i8 (datetimelike)
4890            # try to keep us at this dtype
4891            needs_i8_conversion_i = needs_i8_conversion(new_dtype)
4892            if needs_i8_conversion_i:
4893                arr = func(series, otherSeries, True)
4894            else:
4895                arr = func(series, otherSeries)
4896
4897            arr = maybe_downcast_to_dtype(arr, this_dtype)
4898
4899            result[col] = arr
4900
4901        # convert_objects just in case
4902        return self._constructor(result, index=new_index,
4903                                 columns=new_columns)._convert(datetime=True,
4904                                                               copy=False)
4905
4906    def combine_first(self, other):
4907        """
4908        Combine two DataFrame objects and default to non-null values in frame
4909        calling the method. Result index columns will be the union of the
4910        respective indexes and columns
4911
4912        Parameters
4913        ----------
4914        other : DataFrame
4915
4916        Returns
4917        -------
4918        combined : DataFrame
4919
4920        Examples
4921        --------
4922        df1's values prioritized, use values from df2 to fill holes:
4923
4924        >>> df1 = pd.DataFrame([[1, np.nan]])
4925        >>> df2 = pd.DataFrame([[3, 4]])
4926        >>> df1.combine_first(df2)
4927           0    1
4928        0  1  4.0
4929
4930        See Also
4931        --------
4932        DataFrame.combine : Perform series-wise operation on two DataFrames
4933            using a given function
4934        """
4935        import pandas.core.computation.expressions as expressions
4936
4937        def combiner(x, y, needs_i8_conversion=False):
4938            x_values = x.values if hasattr(x, 'values') else x
4939            y_values = y.values if hasattr(y, 'values') else y
4940            if needs_i8_conversion:
4941                mask = isna(x)
4942                x_values = x_values.view('i8')
4943                y_values = y_values.view('i8')
4944            else:
4945                mask = isna(x_values)
4946
4947            return expressions.where(mask, y_values, x_values)
4948
4949        return self.combine(other, combiner, overwrite=False)
4950
4951    def update(self, other, join='left', overwrite=True, filter_func=None,
4952               raise_conflict=False):
4953        """
4954        Modify in place using non-NA values from another DataFrame.
4955
4956        Aligns on indices. There is no return value.
4957
4958        Parameters
4959        ----------
4960        other : DataFrame, or object coercible into a DataFrame
4961            Should have at least one matching index/column label
4962            with the original DataFrame. If a Series is passed,
4963            its name attribute must be set, and that will be
4964            used as the column name to align with the original DataFrame.
4965        join : {'left'}, default 'left'
4966            Only left join is implemented, keeping the index and columns of the
4967            original object.
4968        overwrite : bool, default True
4969            How to handle non-NA values for overlapping keys:
4970
4971            * True: overwrite original DataFrame's values
4972              with values from `other`.
4973            * False: only update values that are NA in
4974              the original DataFrame.
4975
4976        filter_func : callable(1d-array) -> boolean 1d-array, optional
4977            Can choose to replace values other than NA. Return True for values
4978            that should be updated.
4979        raise_conflict : bool, default False
4980            If True, will raise a ValueError if the DataFrame and `other`
4981            both contain non-NA data in the same place.
4982
4983        Raises
4984        ------
4985        ValueError
4986            When `raise_conflict` is True and there's overlapping non-NA data.
4987
4988        See Also
4989        --------
4990        dict.update : Similar method for dictionaries.
4991        DataFrame.merge : For column(s)-on-columns(s) operations.
4992
4993        Examples
4994        --------
4995        >>> df = pd.DataFrame({'A': [1, 2, 3],
4996        ...                    'B': [400, 500, 600]})
4997        >>> new_df = pd.DataFrame({'B': [4, 5, 6],
4998        ...                        'C': [7, 8, 9]})
4999        >>> df.update(new_df)
5000        >>> df
5001           A  B
5002        0  1  4
5003        1  2  5
5004        2  3  6
5005
5006        The DataFrame's length does not increase as a result of the update,
5007        only values at matching index/column labels are updated.
5008
5009        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5010        ...                    'B': ['x', 'y', 'z']})
5011        >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
5012        >>> df.update(new_df)
5013        >>> df
5014           A  B
5015        0  a  d
5016        1  b  e
5017        2  c  f
5018
5019        For Series, it's name attribute must be set.
5020
5021        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5022        ...                    'B': ['x', 'y', 'z']})
5023        >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
5024        >>> df.update(new_column)
5025        >>> df
5026           A  B
5027        0  a  d
5028        1  b  y
5029        2  c  e
5030        >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
5031        ...                    'B': ['x', 'y', 'z']})
5032        >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
5033        >>> df.update(new_df)
5034        >>> df
5035           A  B
5036        0  a  x
5037        1  b  d
5038        2  c  e
5039
5040        If `other` contains NaNs the corresponding values are not updated
5041        in the original dataframe.
5042
5043        >>> df = pd.DataFrame({'A': [1, 2, 3],
5044        ...                    'B': [400, 500, 600]})
5045        >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
5046        >>> df.update(new_df)
5047        >>> df
5048           A      B
5049        0  1    4.0
5050        1  2  500.0
5051        2  3    6.0
5052        """
5053        import pandas.core.computation.expressions as expressions
5054        # TODO: Support other joins
5055        if join != 'left':  # pragma: no cover
5056            raise NotImplementedError("Only left join is supported")
5057
5058        if not isinstance(other, DataFrame):
5059            other = DataFrame(other)
5060
5061        other = other.reindex_like(self)
5062
5063        for col in self.columns:
5064            this = self[col].values
5065            that = other[col].values
5066            if filter_func is not None:
5067                with np.errstate(all='ignore'):
5068                    mask = ~filter_func(this) | isna(that)
5069            else:
5070                if raise_conflict:
5071                    mask_this = notna(that)
5072                    mask_that = notna(this)
5073                    if any(mask_this & mask_that):
5074                        raise ValueError("Data overlaps.")
5075
5076                if overwrite:
5077                    mask = isna(that)
5078                else:
5079                    mask = notna(this)
5080
5081            # don't overwrite columns unecessarily
5082            if mask.all():
5083                continue
5084
5085            self[col] = expressions.where(mask, this, that)
5086
5087    # ----------------------------------------------------------------------
5088    # Data reshaping
5089
5090    def pivot(self, index=None, columns=None, values=None):
5091        """
5092        Return reshaped DataFrame organized by given index / column values.
5093
5094        Reshape data (produce a "pivot" table) based on column values. Uses
5095        unique values from specified `index` / `columns` to form axes of the
5096        resulting DataFrame. This function does not support data
5097        aggregation, multiple values will result in a MultiIndex in the
5098        columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
5099
5100        Parameters
5101        ----------
5102        index : string or object, optional
5103            Column to use to make new frame's index. If None, uses
5104            existing index.
5105        columns : string or object
5106            Column to use to make new frame's columns.
5107        values : string, object or a list of the previous, optional
5108            Column(s) to use for populating new frame's values. If not
5109            specified, all remaining columns will be used and the result will
5110            have hierarchically indexed columns.
5111
5112            .. versionchanged :: 0.23.0
5113               Also accept list of column names.
5114
5115        Returns
5116        -------
5117        DataFrame
5118            Returns reshaped DataFrame.
5119
5120        Raises
5121        ------
5122        ValueError:
5123            When there are any `index`, `columns` combinations with multiple
5124            values. `DataFrame.pivot_table` when you need to aggregate.
5125
5126        See Also
5127        --------
5128        DataFrame.pivot_table : generalization of pivot that can handle
5129            duplicate values for one index/column pair.
5130        DataFrame.unstack : pivot based on the index values instead of a
5131            column.
5132
5133        Notes
5134        -----
5135        For finer-tuned control, see hierarchical indexing documentation along
5136        with the related stack/unstack methods.
5137
5138        Examples
5139        --------
5140        >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
5141        ...                            'two'],
5142        ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
5143        ...                    'baz': [1, 2, 3, 4, 5, 6],
5144        ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
5145        >>> df
5146            foo   bar  baz  zoo
5147        0   one   A    1    x
5148        1   one   B    2    y
5149        2   one   C    3    z
5150        3   two   A    4    q
5151        4   two   B    5    w
5152        5   two   C    6    t
5153
5154        >>> df.pivot(index='foo', columns='bar', values='baz')
5155        bar  A   B   C
5156        foo
5157        one  1   2   3
5158        two  4   5   6
5159
5160        >>> df.pivot(index='foo', columns='bar')['baz']
5161        bar  A   B   C
5162        foo
5163        one  1   2   3
5164        two  4   5   6
5165
5166        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
5167              baz       zoo
5168        bar   A  B  C   A  B  C
5169        foo
5170        one   1  2  3   x  y  z
5171        two   4  5  6   q  w  t
5172
5173        A ValueError is raised if there are any duplicates.
5174
5175        >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
5176        ...                    "bar": ['A', 'A', 'B', 'C'],
5177        ...                    "baz": [1, 2, 3, 4]})
5178        >>> df
5179           foo bar  baz
5180        0  one   A    1
5181        1  one   A    2
5182        2  two   B    3
5183        3  two   C    4
5184
5185        Notice that the first two rows are the same for our `index`
5186        and `columns` arguments.
5187
5188        >>> df.pivot(index='foo', columns='bar', values='baz')
5189        Traceback (most recent call last):
5190           ...
5191        ValueError: Index contains duplicate entries, cannot reshape
5192        """
5193        from pandas.core.reshape.reshape import pivot
5194        return pivot(self, index=index, columns=columns, values=values)
5195
5196    _shared_docs['pivot_table'] = """
5197        Create a spreadsheet-style pivot table as a DataFrame. The levels in
5198        the pivot table will be stored in MultiIndex objects (hierarchical
5199        indexes) on the index and columns of the result DataFrame
5200
5201        Parameters
5202        ----------%s
5203        values : column to aggregate, optional
5204        index : column, Grouper, array, or list of the previous
5205            If an array is passed, it must be the same length as the data. The
5206            list can contain any of the other types (except list).
5207            Keys to group by on the pivot table index.  If an array is passed,
5208            it is being used as the same manner as column values.
5209        columns : column, Grouper, array, or list of the previous
5210            If an array is passed, it must be the same length as the data. The
5211            list can contain any of the other types (except list).
5212            Keys to group by on the pivot table column.  If an array is passed,
5213            it is being used as the same manner as column values.
5214        aggfunc : function, list of functions, dict, default numpy.mean
5215            If list of functions passed, the resulting pivot table will have
5216            hierarchical columns whose top level are the function names
5217            (inferred from the function objects themselves)
5218            If dict is passed, the key is column to aggregate and value
5219            is function or list of functions
5220        fill_value : scalar, default None
5221            Value to replace missing values with
5222        margins : boolean, default False
5223            Add all row / columns (e.g. for subtotal / grand totals)
5224        dropna : boolean, default True
5225            Do not include columns whose entries are all NaN
5226        margins_name : string, default 'All'
5227            Name of the row / column that will contain the totals
5228            when margins is True.
5229
5230        Examples
5231        --------
5232        >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
5233        ...                          "bar", "bar", "bar", "bar"],
5234        ...                    "B": ["one", "one", "one", "two", "two",
5235        ...                          "one", "one", "two", "two"],
5236        ...                    "C": ["small", "large", "large", "small",
5237        ...                          "small", "large", "small", "small",
5238        ...                          "large"],
5239        ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
5240        >>> df
5241             A    B      C  D
5242        0  foo  one  small  1
5243        1  foo  one  large  2
5244        2  foo  one  large  2
5245        3  foo  two  small  3
5246        4  foo  two  small  3
5247        5  bar  one  large  4
5248        6  bar  one  small  5
5249        7  bar  two  small  6
5250        8  bar  two  large  7
5251
5252        >>> table = pivot_table(df, values='D', index=['A', 'B'],
5253        ...                     columns=['C'], aggfunc=np.sum)
5254        >>> table
5255        C        large  small
5256        A   B
5257        bar one    4.0    5.0
5258            two    7.0    6.0
5259        foo one    4.0    1.0
5260            two    NaN    6.0
5261
5262        >>> table = pivot_table(df, values='D', index=['A', 'B'],
5263        ...                     columns=['C'], aggfunc=np.sum)
5264        >>> table
5265        C        large  small
5266        A   B
5267        bar one    4.0    5.0
5268            two    7.0    6.0
5269        foo one    4.0    1.0
5270            two    NaN    6.0
5271
5272        >>> table = pivot_table(df, values=['D', 'E'], index=['A', 'C'],
5273        ...                     aggfunc={'D': np.mean,
5274        ...                              'E': [min, max, np.mean]})
5275        >>> table
5276                          D   E
5277                       mean max median min
5278        A   C
5279        bar large  5.500000  16   14.5  13
5280            small  5.500000  15   14.5  14
5281        foo large  2.000000  10    9.5   9
5282            small  2.333333  12   11.0   8
5283
5284        Returns
5285        -------
5286        table : DataFrame
5287
5288        See also
5289        --------
5290        DataFrame.pivot : pivot without aggregation that can handle
5291            non-numeric data
5292        """
5293
5294    @Substitution('')
5295    @Appender(_shared_docs['pivot_table'])
5296    def pivot_table(self, values=None, index=None, columns=None,
5297                    aggfunc='mean', fill_value=None, margins=False,
5298                    dropna=True, margins_name='All'):
5299        from pandas.core.reshape.pivot import pivot_table
5300        return pivot_table(self, values=values, index=index, columns=columns,
5301                           aggfunc=aggfunc, fill_value=fill_value,
5302                           margins=margins, dropna=dropna,
5303                           margins_name=margins_name)
5304
5305    def stack(self, level=-1, dropna=True):
5306        """
5307        Stack the prescribed level(s) from columns to index.
5308
5309        Return a reshaped DataFrame or Series having a multi-level
5310        index with one or more new inner-most levels compared to the current
5311        DataFrame. The new inner-most levels are created by pivoting the
5312        columns of the current dataframe:
5313
5314          - if the columns have a single level, the output is a Series;
5315          - if the columns have multiple levels, the new index
5316            level(s) is (are) taken from the prescribed level(s) and
5317            the output is a DataFrame.
5318
5319        The new index levels are sorted.
5320
5321        Parameters
5322        ----------
5323        level : int, str, list, default -1
5324            Level(s) to stack from the column axis onto the index
5325            axis, defined as one index or label, or a list of indices
5326            or labels.
5327        dropna : bool, default True
5328            Whether to drop rows in the resulting Frame/Series with
5329            missing values. Stacking a column level onto the index
5330            axis can create combinations of index and column values
5331            that are missing from the original dataframe. See Examples
5332            section.
5333
5334        Returns
5335        -------
5336        DataFrame or Series
5337            Stacked dataframe or series.
5338
5339        See Also
5340        --------
5341        DataFrame.unstack : Unstack prescribed level(s) from index axis
5342             onto column axis.
5343        DataFrame.pivot : Reshape dataframe from long format to wide
5344             format.
5345        DataFrame.pivot_table : Create a spreadsheet-style pivot table
5346             as a DataFrame.
5347
5348        Notes
5349        -----
5350        The function is named by analogy with a collection of books
5351        being re-organised from being side by side on a horizontal
5352        position (the columns of the dataframe) to being stacked
5353        vertically on top of of each other (in the index of the
5354        dataframe).
5355
5356        Examples
5357        --------
5358        **Single level columns**
5359
5360        >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
5361        ...                                     index=['cat', 'dog'],
5362        ...                                     columns=['weight', 'height'])
5363
5364        Stacking a dataframe with a single level column axis returns a Series:
5365
5366        >>> df_single_level_cols
5367             weight height
5368        cat       0      1
5369        dog       2      3
5370        >>> df_single_level_cols.stack()
5371        cat  weight    0
5372             height    1
5373        dog  weight    2
5374             height    3
5375        dtype: int64
5376
5377        **Multi level columns: simple case**
5378
5379        >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
5380        ...                                        ('weight', 'pounds')])
5381        >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
5382        ...                                     index=['cat', 'dog'],
5383        ...                                     columns=multicol1)
5384
5385        Stacking a dataframe with a multi-level column axis:
5386
5387        >>> df_multi_level_cols1
5388             weight
5389                 kg    pounds
5390        cat       1        2
5391        dog       2        4
5392        >>> df_multi_level_cols1.stack()
5393                    weight
5394        cat kg           1
5395            pounds       2
5396        dog kg           2
5397            pounds       4
5398
5399        **Missing values**
5400
5401        >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
5402        ...                                        ('height', 'm')])
5403        >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
5404        ...                                     index=['cat', 'dog'],
5405        ...                                     columns=multicol2)
5406
5407        It is common to have missing values when stacking a dataframe
5408        with multi-level columns, as the stacked dataframe typically
5409        has more values than the original dataframe. Missing values
5410        are filled with NaNs:
5411
5412        >>> df_multi_level_cols2
5413            weight height
5414                kg      m
5415        cat    1.0    2.0
5416        dog    3.0    4.0
5417        >>> df_multi_level_cols2.stack()
5418                height  weight
5419        cat kg     NaN     1.0
5420            m      2.0     NaN
5421        dog kg     NaN     3.0
5422            m      4.0     NaN
5423
5424        **Prescribing the level(s) to be stacked**
5425
5426        The first parameter controls which level or levels are stacked:
5427
5428        >>> df_multi_level_cols2.stack(0)
5429                     kg    m
5430        cat height  NaN  2.0
5431            weight  1.0  NaN
5432        dog height  NaN  4.0
5433            weight  3.0  NaN
5434        >>> df_multi_level_cols2.stack([0, 1])
5435        cat  height  m     2.0
5436             weight  kg    1.0
5437        dog  height  m     4.0
5438             weight  kg    3.0
5439        dtype: float64
5440
5441        **Dropping missing values**
5442
5443        >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
5444        ...                                     index=['cat', 'dog'],
5445        ...                                     columns=multicol2)
5446
5447        Note that rows where all values are missing are dropped by
5448        default but this behaviour can be controlled via the dropna
5449        keyword parameter:
5450
5451        >>> df_multi_level_cols3
5452            weight height
5453                kg      m
5454        cat    NaN    1.0
5455        dog    2.0    3.0
5456        >>> df_multi_level_cols3.stack(dropna=False)
5457                height  weight
5458        cat kg     NaN     NaN
5459            m      1.0     NaN
5460        dog kg     NaN     2.0
5461            m      3.0     NaN
5462        >>> df_multi_level_cols3.stack(dropna=True)
5463                height  weight
5464        cat m      1.0     NaN
5465        dog kg     NaN     2.0
5466            m      3.0     NaN
5467        """
5468        from pandas.core.reshape.reshape import stack, stack_multiple
5469
5470        if isinstance(level, (tuple, list)):
5471            return stack_multiple(self, level, dropna=dropna)
5472        else:
5473            return stack(self, level, dropna=dropna)
5474
5475    def unstack(self, level=-1, fill_value=None):
5476        """
5477        Pivot a level of the (necessarily hierarchical) index labels, returning
5478        a DataFrame having a new level of column labels whose inner-most level
5479        consists of the pivoted index labels. If the index is not a MultiIndex,
5480        the output will be a Series (the analogue of stack when the columns are
5481        not a MultiIndex).
5482        The level involved will automatically get sorted.
5483
5484        Parameters
5485        ----------
5486        level : int, string, or list of these, default -1 (last level)
5487            Level(s) of index to unstack, can pass level name
5488        fill_value : replace NaN with this value if the unstack produces
5489            missing values
5490
5491            .. versionadded:: 0.18.0
5492
5493        See also
5494        --------
5495        DataFrame.pivot : Pivot a table based on column values.
5496        DataFrame.stack : Pivot a level of the column labels (inverse operation
5497            from `unstack`).
5498
5499        Examples
5500        --------
5501        >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
5502        ...                                    ('two', 'a'), ('two', 'b')])
5503        >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
5504        >>> s
5505        one  a   1.0
5506             b   2.0
5507        two  a   3.0
5508             b   4.0
5509        dtype: float64
5510
5511        >>> s.unstack(level=-1)
5512             a   b
5513        one  1.0  2.0
5514        two  3.0  4.0
5515
5516        >>> s.unstack(level=0)
5517           one  two
5518        a  1.0   3.0
5519        b  2.0   4.0
5520
5521        >>> df = s.unstack(level=0)
5522        >>> df.unstack()
5523        one  a  1.0
5524             b  2.0
5525        two  a  3.0
5526             b  4.0
5527        dtype: float64
5528
5529        Returns
5530        -------
5531        unstacked : DataFrame or Series
5532        """
5533        from pandas.core.reshape.reshape import unstack
5534        return unstack(self, level, fill_value)
5535
5536    _shared_docs['melt'] = ("""
5537    "Unpivots" a DataFrame from wide format to long format, optionally
5538    leaving identifier variables set.
5539
5540    This function is useful to massage a DataFrame into a format where one
5541    or more columns are identifier variables (`id_vars`), while all other
5542    columns, considered measured variables (`value_vars`), are "unpivoted" to
5543    the row axis, leaving just two non-identifier columns, 'variable' and
5544    'value'.
5545
5546    %(versionadded)s
5547    Parameters
5548    ----------
5549    frame : DataFrame
5550    id_vars : tuple, list, or ndarray, optional
5551        Column(s) to use as identifier variables.
5552    value_vars : tuple, list, or ndarray, optional
5553        Column(s) to unpivot. If not specified, uses all columns that
5554        are not set as `id_vars`.
5555    var_name : scalar
5556        Name to use for the 'variable' column. If None it uses
5557        ``frame.columns.name`` or 'variable'.
5558    value_name : scalar, default 'value'
5559        Name to use for the 'value' column.
5560    col_level : int or string, optional
5561        If columns are a MultiIndex then use this level to melt.
5562
5563    See also
5564    --------
5565    %(other)s
5566    pivot_table
5567    DataFrame.pivot
5568
5569    Examples
5570    --------
5571    >>> import pandas as pd
5572    >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
5573    ...                    'B': {0: 1, 1: 3, 2: 5},
5574    ...                    'C': {0: 2, 1: 4, 2: 6}})
5575    >>> df
5576       A  B  C
5577    0  a  1  2
5578    1  b  3  4
5579    2  c  5  6
5580
5581    >>> %(caller)sid_vars=['A'], value_vars=['B'])
5582       A variable  value
5583    0  a        B      1
5584    1  b        B      3
5585    2  c        B      5
5586
5587    >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'])
5588       A variable  value
5589    0  a        B      1
5590    1  b        B      3
5591    2  c        B      5
5592    3  a        C      2
5593    4  b        C      4
5594    5  c        C      6
5595
5596    The names of 'variable' and 'value' columns can be customized:
5597
5598    >>> %(caller)sid_vars=['A'], value_vars=['B'],
5599    ...         var_name='myVarname', value_name='myValname')
5600       A myVarname  myValname
5601    0  a         B          1
5602    1  b         B          3
5603    2  c         B          5
5604
5605    If you have multi-index columns:
5606
5607    >>> df.columns = [list('ABC'), list('DEF')]
5608    >>> df
5609       A  B  C
5610       D  E  F
5611    0  a  1  2
5612    1  b  3  4
5613    2  c  5  6
5614
5615    >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B'])
5616       A variable  value
5617    0  a        B      1
5618    1  b        B      3
5619    2  c        B      5
5620
5621    >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')])
5622      (A, D) variable_0 variable_1  value
5623    0      a          B          E      1
5624    1      b          B          E      3
5625    2      c          B          E      5
5626
5627    """)
5628
5629    @Appender(_shared_docs['melt'] %
5630              dict(caller='df.melt(',
5631                   versionadded='.. versionadded:: 0.20.0\n',
5632                   other='melt'))
5633    def melt(self, id_vars=None, value_vars=None, var_name=None,
5634             value_name='value', col_level=None):
5635        from pandas.core.reshape.melt import melt
5636        return melt(self, id_vars=id_vars, value_vars=value_vars,
5637                    var_name=var_name, value_name=value_name,
5638                    col_level=col_level)
5639
5640    # ----------------------------------------------------------------------
5641    # Time series-related
5642
5643    def diff(self, periods=1, axis=0):
5644        """
5645        First discrete difference of element.
5646
5647        Calculates the difference of a DataFrame element compared with another
5648        element in the DataFrame (default is the element in the same column
5649        of the previous row).
5650
5651        Parameters
5652        ----------
5653        periods : int, default 1
5654            Periods to shift for calculating difference, accepts negative
5655            values.
5656        axis : {0 or 'index', 1 or 'columns'}, default 0
5657            Take difference over rows (0) or columns (1).
5658
5659            .. versionadded:: 0.16.1.
5660
5661        Returns
5662        -------
5663        diffed : DataFrame
5664
5665        See Also
5666        --------
5667        Series.diff: First discrete difference for a Series.
5668        DataFrame.pct_change: Percent change over given number of periods.
5669        DataFrame.shift: Shift index by desired number of periods with an
5670            optional time freq.
5671
5672        Examples
5673        --------
5674        Difference with previous row
5675
5676        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
5677        ...                    'b': [1, 1, 2, 3, 5, 8],
5678        ...                    'c': [1, 4, 9, 16, 25, 36]})
5679        >>> df
5680           a  b   c
5681        0  1  1   1
5682        1  2  1   4
5683        2  3  2   9
5684        3  4  3  16
5685        4  5  5  25
5686        5  6  8  36
5687
5688        >>> df.diff()
5689             a    b     c
5690        0  NaN  NaN   NaN
5691        1  1.0  0.0   3.0
5692        2  1.0  1.0   5.0
5693        3  1.0  1.0   7.0
5694        4  1.0  2.0   9.0
5695        5  1.0  3.0  11.0
5696
5697        Difference with previous column
5698
5699        >>> df.diff(axis=1)
5700            a    b     c
5701        0 NaN  0.0   0.0
5702        1 NaN -1.0   3.0
5703        2 NaN -1.0   7.0
5704        3 NaN -1.0  13.0
5705        4 NaN  0.0  20.0
5706        5 NaN  2.0  28.0
5707
5708        Difference with 3rd previous row
5709
5710        >>> df.diff(periods=3)
5711             a    b     c
5712        0  NaN  NaN   NaN
5713        1  NaN  NaN   NaN
5714        2  NaN  NaN   NaN
5715        3  3.0  2.0  15.0
5716        4  3.0  4.0  21.0
5717        5  3.0  6.0  27.0
5718
5719        Difference with following row
5720
5721        >>> df.diff(periods=-1)
5722             a    b     c
5723        0 -1.0  0.0  -3.0
5724        1 -1.0 -1.0  -5.0
5725        2 -1.0 -1.0  -7.0
5726        3 -1.0 -2.0  -9.0
5727        4 -1.0 -3.0 -11.0
5728        5  NaN  NaN   NaN
5729        """
5730        bm_axis = self._get_block_manager_axis(axis)
5731        new_data = self._data.diff(n=periods, axis=bm_axis)
5732        return self._constructor(new_data)
5733
5734    # ----------------------------------------------------------------------
5735    # Function application
5736
5737    def _gotitem(self,
5738                 key,           # type: Union[str, List[str]]
5739                 ndim,          # type: int
5740                 subset=None    # type: Union[Series, DataFrame, None]
5741                 ):
5742        # type: (...) -> Union[Series, DataFrame]
5743        """
5744        sub-classes to define
5745        return a sliced object
5746
5747        Parameters
5748        ----------
5749        key : string / list of selections
5750        ndim : 1,2
5751            requested ndim of result
5752        subset : object, default None
5753            subset to act on
5754        """
5755        if subset is None:
5756            subset = self
5757        elif subset.ndim == 1:  # is Series
5758            return subset
5759
5760        # TODO: _shallow_copy(subset)?
5761        return subset[key]
5762
5763    _agg_doc = dedent("""
5764    The aggregation operations are always performed over an axis, either the
5765    index (default) or the column axis. This behavior is different from
5766    `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
5767    `var`), where the default is to compute the aggregation of the flattened
5768    array, e.g., ``numpy.mean(arr_2d)`` as opposed to ``numpy.mean(arr_2d,
5769    axis=0)``.
5770
5771    `agg` is an alias for `aggregate`. Use the alias.
5772
5773    Examples
5774    --------
5775    >>> df = pd.DataFrame([[1, 2, 3],
5776    ...                    [4, 5, 6],
5777    ...                    [7, 8, 9],
5778    ...                    [np.nan, np.nan, np.nan]],
5779    ...                   columns=['A', 'B', 'C'])
5780
5781    Aggregate these functions over the rows.
5782
5783    >>> df.agg(['sum', 'min'])
5784            A     B     C
5785    sum  12.0  15.0  18.0
5786    min   1.0   2.0   3.0
5787
5788    Different aggregations per column.
5789
5790    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
5791            A    B
5792    max   NaN  8.0
5793    min   1.0  2.0
5794    sum  12.0  NaN
5795
5796    Aggregate over the columns.
5797
5798    >>> df.agg("mean", axis="columns")
5799    0    2.0
5800    1    5.0
5801    2    8.0
5802    3    NaN
5803    dtype: float64
5804
5805    See also
5806    --------
5807    DataFrame.apply : Perform any type of operations.
5808    DataFrame.transform : Perform transformation type operations.
5809    pandas.core.groupby.GroupBy : Perform operations over groups.
5810    pandas.core.resample.Resampler : Perform operations over resampled bins.
5811    pandas.core.window.Rolling : Perform operations over rolling window.
5812    pandas.core.window.Expanding : Perform operations over expanding window.
5813    pandas.core.window.EWM : Perform operation over exponential weighted
5814        window.
5815    """)
5816
5817    @Appender(_agg_doc)
5818    @Appender(_shared_docs['aggregate'] % dict(
5819        versionadded='.. versionadded:: 0.20.0',
5820        **_shared_doc_kwargs))
5821    def aggregate(self, func, axis=0, *args, **kwargs):
5822        axis = self._get_axis_number(axis)
5823
5824        # TODO: flipped axis
5825        result = None
5826        if axis == 0:
5827            try:
5828                result, how = self._aggregate(func, axis=0, *args, **kwargs)
5829            except TypeError:
5830                pass
5831        if result is None:
5832            return self.apply(func, axis=axis, args=args, **kwargs)
5833        return result
5834
5835    agg = aggregate
5836
5837    def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None,
5838              result_type=None, args=(), **kwds):
5839        """
5840        Apply a function along an axis of the DataFrame.
5841
5842        Objects passed to the function are Series objects whose index is
5843        either the DataFrame's index (``axis=0``) or the DataFrame's columns
5844        (``axis=1``). By default (``result_type=None``), the final return type
5845        is inferred from the return type of the applied function. Otherwise,
5846        it depends on the `result_type` argument.
5847
5848        Parameters
5849        ----------
5850        func : function
5851            Function to apply to each column or row.
5852        axis : {0 or 'index', 1 or 'columns'}, default 0
5853            Axis along which the function is applied:
5854
5855            * 0 or 'index': apply function to each column.
5856            * 1 or 'columns': apply function to each row.
5857        broadcast : bool, optional
5858            Only relevant for aggregation functions:
5859
5860            * ``False`` or ``None`` : returns a Series whose length is the
5861              length of the index or the number of columns (based on the
5862              `axis` parameter)
5863            * ``True`` : results will be broadcast to the original shape
5864              of the frame, the original index and columns will be retained.
5865
5866            .. deprecated:: 0.23.0
5867               This argument will be removed in a future version, replaced
5868               by result_type='broadcast'.
5869
5870        raw : bool, default False
5871            * ``False`` : passes each row or column as a Series to the
5872              function.
5873            * ``True`` : the passed function will receive ndarray objects
5874              instead.
5875              If you are just applying a NumPy reduction function this will
5876              achieve much better performance.
5877        reduce : bool or None, default None
5878            Try to apply reduction procedures. If the DataFrame is empty,
5879            `apply` will use `reduce` to determine whether the result
5880            should be a Series or a DataFrame. If ``reduce=None`` (the
5881            default), `apply`'s return value will be guessed by calling
5882            `func` on an empty Series
5883            (note: while guessing, exceptions raised by `func` will be
5884            ignored).
5885            If ``reduce=True`` a Series will always be returned, and if
5886            ``reduce=False`` a DataFrame will always be returned.
5887
5888            .. deprecated:: 0.23.0
5889               This argument will be removed in a future version, replaced
5890               by ``result_type='reduce'``.
5891
5892        result_type : {'expand', 'reduce', 'broadcast', None}, default None
5893            These only act when ``axis=1`` (columns):
5894
5895            * 'expand' : list-like results will be turned into columns.
5896            * 'reduce' : returns a Series if possible rather than expanding
5897              list-like results. This is the opposite of 'expand'.
5898            * 'broadcast' : results will be broadcast to the original shape
5899              of the DataFrame, the original index and columns will be
5900              retained.
5901
5902            The default behaviour (None) depends on the return value of the
5903            applied function: list-like results will be returned as a Series
5904            of those. However if the apply function returns a Series these
5905            are expanded to columns.
5906
5907            .. versionadded:: 0.23.0
5908
5909        args : tuple
5910            Positional arguments to pass to `func` in addition to the
5911            array/series.
5912        **kwds
5913            Additional keyword arguments to pass as keywords arguments to
5914            `func`.
5915
5916        Notes
5917        -----
5918        In the current implementation apply calls `func` twice on the
5919        first column/row to decide whether it can take a fast or slow
5920        code path. This can lead to unexpected behavior if `func` has
5921        side-effects, as they will take effect twice for the first
5922        column/row.
5923
5924        See also
5925        --------
5926        DataFrame.applymap: For elementwise operations
5927        DataFrame.aggregate: only perform aggregating type operations
5928        DataFrame.transform: only perform transformating type operations
5929
5930        Examples
5931        --------
5932
5933        >>> df = pd.DataFrame([[4, 9],] * 3, columns=['A', 'B'])
5934        >>> df
5935           A  B
5936        0  4  9
5937        1  4  9
5938        2  4  9
5939
5940        Using a numpy universal function (in this case the same as
5941        ``np.sqrt(df)``):
5942
5943        >>> df.apply(np.sqrt)
5944             A    B
5945        0  2.0  3.0
5946        1  2.0  3.0
5947        2  2.0  3.0
5948
5949        Using a reducing function on either axis
5950
5951        >>> df.apply(np.sum, axis=0)
5952        A    12
5953        B    27
5954        dtype: int64
5955
5956        >>> df.apply(np.sum, axis=1)
5957        0    13
5958        1    13
5959        2    13
5960        dtype: int64
5961
5962        Retuning a list-like will result in a Series
5963
5964        >>> df.apply(lambda x: [1, 2], axis=1)
5965        0    [1, 2]
5966        1    [1, 2]
5967        2    [1, 2]
5968        dtype: object
5969
5970        Passing result_type='expand' will expand list-like results
5971        to columns of a Dataframe
5972
5973        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
5974           0  1
5975        0  1  2
5976        1  1  2
5977        2  1  2
5978
5979        Returning a Series inside the function is similar to passing
5980        ``result_type='expand'``. The resulting column names
5981        will be the Series index.
5982
5983        >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
5984           foo  bar
5985        0    1    2
5986        1    1    2
5987        2    1    2
5988
5989        Passing ``result_type='broadcast'`` will ensure the same shape
5990        result, whether list-like or scalar is returned by the function,
5991        and broadcast it along the axis. The resulting column names will
5992        be the originals.
5993
5994        >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
5995           A  B
5996        0  1  2
5997        1  1  2
5998        2  1  2
5999
6000        Returns
6001        -------
6002        applied : Series or DataFrame
6003        """
6004        from pandas.core.apply import frame_apply
6005        op = frame_apply(self,
6006                         func=func,
6007                         axis=axis,
6008                         broadcast=broadcast,
6009                         raw=raw,
6010                         reduce=reduce,
6011                         result_type=result_type,
6012                         args=args,
6013                         kwds=kwds)
6014        return op.get_result()
6015
6016    def applymap(self, func):
6017        """
6018        Apply a function to a Dataframe elementwise.
6019
6020        This method applies a function that accepts and returns a scalar
6021        to every element of a DataFrame.
6022
6023        Parameters
6024        ----------
6025        func : callable
6026            Python function, returns a single value from a single value.
6027
6028        Returns
6029        -------
6030        DataFrame
6031            Transformed DataFrame.
6032
6033        See also
6034        --------
6035        DataFrame.apply : Apply a function along input axis of DataFrame
6036
6037        Examples
6038        --------
6039        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
6040        >>> df
6041               0      1
6042        0  1.000  2.120
6043        1  3.356  4.567
6044
6045        >>> df.applymap(lambda x: len(str(x)))
6046           0  1
6047        0  3  4
6048        1  5  5
6049
6050        Note that a vectorized version of `func` often exists, which will
6051        be much faster. You could square each number elementwise.
6052
6053        >>> df.applymap(lambda x: x**2)
6054                   0          1
6055        0   1.000000   4.494400
6056        1  11.262736  20.857489
6057
6058        But it's better to avoid applymap in that case.
6059
6060        >>> df ** 2
6061                   0          1
6062        0   1.000000   4.494400
6063        1  11.262736  20.857489
6064        """
6065
6066        # if we have a dtype == 'M8[ns]', provide boxed values
6067        def infer(x):
6068            if x.empty:
6069                return lib.map_infer(x, func)
6070            return lib.map_infer(x.astype(object).values, func)
6071
6072        return self.apply(infer)
6073
6074    # ----------------------------------------------------------------------
6075    # Merging / joining methods
6076
6077    def append(self, other, ignore_index=False,
6078               verify_integrity=False, sort=None):
6079        """
6080        Append rows of `other` to the end of this frame, returning a new
6081        object. Columns not in this frame are added as new columns.
6082
6083        Parameters
6084        ----------
6085        other : DataFrame or Series/dict-like object, or list of these
6086            The data to append.
6087        ignore_index : boolean, default False
6088            If True, do not use the index labels.
6089        verify_integrity : boolean, default False
6090            If True, raise ValueError on creating index with duplicates.
6091        sort : boolean, default None
6092            Sort columns if the columns of `self` and `other` are not aligned.
6093            The default sorting is deprecated and will change to not-sorting
6094            in a future version of pandas. Explicitly pass ``sort=True`` to
6095            silence the warning and sort. Explicitly pass ``sort=False`` to
6096            silence the warning and not sort.
6097
6098            .. versionadded:: 0.23.0
6099
6100        Returns
6101        -------
6102        appended : DataFrame
6103
6104        Notes
6105        -----
6106        If a list of dict/series is passed and the keys are all contained in
6107        the DataFrame's index, the order of the columns in the resulting
6108        DataFrame will be unchanged.
6109
6110        Iteratively appending rows to a DataFrame can be more computationally
6111        intensive than a single concatenate. A better solution is to append
6112        those rows to a list and then concatenate the list with the original
6113        DataFrame all at once.
6114
6115        See also
6116        --------
6117        pandas.concat : General function to concatenate DataFrame, Series
6118            or Panel objects
6119
6120        Examples
6121        --------
6122
6123        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
6124        >>> df
6125           A  B
6126        0  1  2
6127        1  3  4
6128        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
6129        >>> df.append(df2)
6130           A  B
6131        0  1  2
6132        1  3  4
6133        0  5  6
6134        1  7  8
6135
6136        With `ignore_index` set to True:
6137
6138        >>> df.append(df2, ignore_index=True)
6139           A  B
6140        0  1  2
6141        1  3  4
6142        2  5  6
6143        3  7  8
6144
6145        The following, while not recommended methods for generating DataFrames,
6146        show two ways to generate a DataFrame from multiple data sources.
6147
6148        Less efficient:
6149
6150        >>> df = pd.DataFrame(columns=['A'])
6151        >>> for i in range(5):
6152        ...     df = df.append({'A': i}, ignore_index=True)
6153        >>> df
6154           A
6155        0  0
6156        1  1
6157        2  2
6158        3  3
6159        4  4
6160
6161        More efficient:
6162
6163        >>> pd.concat([pd.DataFrame([i], columns=['A']) for i in range(5)],
6164        ...           ignore_index=True)
6165           A
6166        0  0
6167        1  1
6168        2  2
6169        3  3
6170        4  4
6171
6172        """
6173        if isinstance(other, (Series, dict)):
6174            if isinstance(other, dict):
6175                other = Series(other)
6176            if other.name is None and not ignore_index:
6177                raise TypeError('Can only append a Series if ignore_index=True'
6178                                ' or if the Series has a name')
6179
6180            if other.name is None:
6181                index = None
6182            else:
6183                # other must have the same index name as self, otherwise
6184                # index name will be reset
6185                index = Index([other.name], name=self.index.name)
6186
6187            idx_diff = other.index.difference(self.columns)
6188            try:
6189                combined_columns = self.columns.append(idx_diff)
6190            except TypeError:
6191                combined_columns = self.columns.astype(object).append(idx_diff)
6192            other = other.reindex(combined_columns, copy=False)
6193            other = DataFrame(other.values.reshape((1, len(other))),
6194                              index=index,
6195                              columns=combined_columns)
6196            other = other._convert(datetime=True, timedelta=True)
6197            if not self.columns.equals(combined_columns):
6198                self = self.reindex(columns=combined_columns)
6199        elif isinstance(other, list) and not isinstance(other[0], DataFrame):
6200            other = DataFrame(other)
6201            if (self.columns.get_indexer(other.columns) >= 0).all():
6202                other = other.loc[:, self.columns]
6203
6204        from pandas.core.reshape.concat import concat
6205        if isinstance(other, (list, tuple)):
6206            to_concat = [self] + other
6207        else:
6208            to_concat = [self, other]
6209        return concat(to_concat, ignore_index=ignore_index,
6210                      verify_integrity=verify_integrity,
6211                      sort=sort)
6212
6213    def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
6214             sort=False):
6215        """
6216        Join columns with other DataFrame either on index or on a key
6217        column. Efficiently Join multiple DataFrame objects by index at once by
6218        passing a list.
6219
6220        Parameters
6221        ----------
6222        other : DataFrame, Series with name field set, or list of DataFrame
6223            Index should be similar to one of the columns in this one. If a
6224            Series is passed, its name attribute must be set, and that will be
6225            used as the column name in the resulting joined DataFrame
6226        on : name, tuple/list of names, or array-like
6227            Column or index level name(s) in the caller to join on the index
6228            in `other`, otherwise joins index-on-index. If multiple
6229            values given, the `other` DataFrame must have a MultiIndex. Can
6230            pass an array as the join key if it is not already contained in
6231            the calling DataFrame. Like an Excel VLOOKUP operation
6232        how : {'left', 'right', 'outer', 'inner'}, default: 'left'
6233            How to handle the operation of the two objects.
6234
6235            * left: use calling frame's index (or column if on is specified)
6236            * right: use other frame's index
6237            * outer: form union of calling frame's index (or column if on is
6238              specified) with other frame's index, and sort it
6239              lexicographically
6240            * inner: form intersection of calling frame's index (or column if
6241              on is specified) with other frame's index, preserving the order
6242              of the calling's one
6243        lsuffix : string
6244            Suffix to use from left frame's overlapping columns
6245        rsuffix : string
6246            Suffix to use from right frame's overlapping columns
6247        sort : boolean, default False
6248            Order result DataFrame lexicographically by the join key. If False,
6249            the order of the join key depends on the join type (how keyword)
6250
6251        Notes
6252        -----
6253        on, lsuffix, and rsuffix options are not supported when passing a list
6254        of DataFrame objects
6255
6256        Support for specifying index levels as the `on` parameter was added
6257        in version 0.23.0
6258
6259        Examples
6260        --------
6261        >>> caller = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
6262        ...                        'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
6263
6264        >>> caller
6265            A key
6266        0  A0  K0
6267        1  A1  K1
6268        2  A2  K2
6269        3  A3  K3
6270        4  A4  K4
6271        5  A5  K5
6272
6273        >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
6274        ...                       'B': ['B0', 'B1', 'B2']})
6275
6276        >>> other
6277            B key
6278        0  B0  K0
6279        1  B1  K1
6280        2  B2  K2
6281
6282        Join DataFrames using their indexes.
6283
6284        >>> caller.join(other, lsuffix='_caller', rsuffix='_other')
6285
6286        >>>     A key_caller    B key_other
6287            0  A0         K0   B0        K0
6288            1  A1         K1   B1        K1
6289            2  A2         K2   B2        K2
6290            3  A3         K3  NaN       NaN
6291            4  A4         K4  NaN       NaN
6292            5  A5         K5  NaN       NaN
6293
6294
6295        If we want to join using the key columns, we need to set key to be
6296        the index in both caller and other. The joined DataFrame will have
6297        key as its index.
6298
6299        >>> caller.set_index('key').join(other.set_index('key'))
6300
6301        >>>      A    B
6302            key
6303            K0   A0   B0
6304            K1   A1   B1
6305            K2   A2   B2
6306            K3   A3  NaN
6307            K4   A4  NaN
6308            K5   A5  NaN
6309
6310        Another option to join using the key columns is to use the on
6311        parameter. DataFrame.join always uses other's index but we can use any
6312        column in the caller. This method preserves the original caller's
6313        index in the result.
6314
6315        >>> caller.join(other.set_index('key'), on='key')
6316
6317        >>>     A key    B
6318            0  A0  K0   B0
6319            1  A1  K1   B1
6320            2  A2  K2   B2
6321            3  A3  K3  NaN
6322            4  A4  K4  NaN
6323            5  A5  K5  NaN
6324
6325
6326        See also
6327        --------
6328        DataFrame.merge : For column(s)-on-columns(s) operations
6329
6330        Returns
6331        -------
6332        joined : DataFrame
6333        """
6334        # For SparseDataFrame's benefit
6335        return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
6336                                 rsuffix=rsuffix, sort=sort)
6337
6338    def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
6339                     sort=False):
6340        from pandas.core.reshape.merge import merge
6341        from pandas.core.reshape.concat import concat
6342
6343        if isinstance(other, Series):
6344            if other.name is None:
6345                raise ValueError('Other Series must have a name')
6346            other = DataFrame({other.name: other})
6347
6348        if isinstance(other, DataFrame):
6349            return merge(self, other, left_on=on, how=how,
6350                         left_index=on is None, right_index=True,
6351                         suffixes=(lsuffix, rsuffix), sort=sort)
6352        else:
6353            if on is not None:
6354                raise ValueError('Joining multiple DataFrames only supported'
6355                                 ' for joining on index')
6356
6357            frames = [self] + list(other)
6358
6359            can_concat = all(df.index.is_unique for df in frames)
6360
6361            # join indexes only using concat
6362            if can_concat:
6363                if how == 'left':
6364                    how = 'outer'
6365                    join_axes = [self.index]
6366                else:
6367                    join_axes = None
6368                return concat(frames, axis=1, join=how, join_axes=join_axes,
6369                              verify_integrity=True)
6370
6371            joined = frames[0]
6372
6373            for frame in frames[1:]:
6374                joined = merge(joined, frame, how=how, left_index=True,
6375                               right_index=True)
6376
6377            return joined
6378
6379    @Substitution('')
6380    @Appender(_merge_doc, indents=2)
6381    def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
6382              left_index=False, right_index=False, sort=False,
6383              suffixes=('_x', '_y'), copy=True, indicator=False,
6384              validate=None):
6385        from pandas.core.reshape.merge import merge
6386        return merge(self, right, how=how, on=on, left_on=left_on,
6387                     right_on=right_on, left_index=left_index,
6388                     right_index=right_index, sort=sort, suffixes=suffixes,
6389                     copy=copy, indicator=indicator, validate=validate)
6390
6391    def round(self, decimals=0, *args, **kwargs):
6392        """
6393        Round a DataFrame to a variable number of decimal places.
6394
6395        Parameters
6396        ----------
6397        decimals : int, dict, Series
6398            Number of decimal places to round each column to. If an int is
6399            given, round each column to the same number of places.
6400            Otherwise dict and Series round to variable numbers of places.
6401            Column names should be in the keys if `decimals` is a
6402            dict-like, or in the index if `decimals` is a Series. Any
6403            columns not included in `decimals` will be left as is. Elements
6404            of `decimals` which are not columns of the input will be
6405            ignored.
6406
6407        Examples
6408        --------
6409        >>> df = pd.DataFrame(np.random.random([3, 3]),
6410        ...     columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
6411        >>> df
6412                       A         B         C
6413        first   0.028208  0.992815  0.173891
6414        second  0.038683  0.645646  0.577595
6415        third   0.877076  0.149370  0.491027
6416        >>> df.round(2)
6417                   A     B     C
6418        first   0.03  0.99  0.17
6419        second  0.04  0.65  0.58
6420        third   0.88  0.15  0.49
6421        >>> df.round({'A': 1, 'C': 2})
6422                  A         B     C
6423        first   0.0  0.992815  0.17
6424        second  0.0  0.645646  0.58
6425        third   0.9  0.149370  0.49
6426        >>> decimals = pd.Series([1, 0, 2], index=['A', 'B', 'C'])
6427        >>> df.round(decimals)
6428                  A  B     C
6429        first   0.0  1  0.17
6430        second  0.0  1  0.58
6431        third   0.9  0  0.49
6432
6433        Returns
6434        -------
6435        DataFrame object
6436
6437        See Also
6438        --------
6439        numpy.around
6440        Series.round
6441
6442        """
6443        from pandas.core.reshape.concat import concat
6444
6445        def _dict_round(df, decimals):
6446            for col, vals in df.iteritems():
6447                try:
6448                    yield _series_round(vals, decimals[col])
6449                except KeyError:
6450                    yield vals
6451
6452        def _series_round(s, decimals):
6453            if is_integer_dtype(s) or is_float_dtype(s):
6454                return s.round(decimals)
6455            return s
6456
6457        nv.validate_round(args, kwargs)
6458
6459        if isinstance(decimals, (dict, Series)):
6460            if isinstance(decimals, Series):
6461                if not decimals.index.is_unique:
6462                    raise ValueError("Index of decimals must be unique")
6463            new_cols = [col for col in _dict_round(self, decimals)]
6464        elif is_integer(decimals):
6465            # Dispatch to Series.round
6466            new_cols = [_series_round(v, decimals)
6467                        for _, v in self.iteritems()]
6468        else:
6469            raise TypeError("decimals must be an integer, a dict-like or a "
6470                            "Series")
6471
6472        if len(new_cols) > 0:
6473            return self._constructor(concat(new_cols, axis=1),
6474                                     index=self.index,
6475                                     columns=self.columns)
6476        else:
6477            return self
6478
6479    # ----------------------------------------------------------------------
6480    # Statistical methods, etc.
6481
6482    def corr(self, method='pearson', min_periods=1):
6483        """
6484        Compute pairwise correlation of columns, excluding NA/null values
6485
6486        Parameters
6487        ----------
6488        method : {'pearson', 'kendall', 'spearman'}
6489            * pearson : standard correlation coefficient
6490            * kendall : Kendall Tau correlation coefficient
6491            * spearman : Spearman rank correlation
6492        min_periods : int, optional
6493            Minimum number of observations required per pair of columns
6494            to have a valid result. Currently only available for pearson
6495            and spearman correlation
6496
6497        Returns
6498        -------
6499        y : DataFrame
6500        """
6501        numeric_df = self._get_numeric_data()
6502        cols = numeric_df.columns
6503        idx = cols.copy()
6504        mat = numeric_df.values
6505
6506        if method == 'pearson':
6507            correl = libalgos.nancorr(_ensure_float64(mat), minp=min_periods)
6508        elif method == 'spearman':
6509            correl = libalgos.nancorr_spearman(_ensure_float64(mat),
6510                                               minp=min_periods)
6511        else:
6512            if min_periods is None:
6513                min_periods = 1
6514            mat = _ensure_float64(mat).T
6515            corrf = nanops.get_corr_func(method)
6516            K = len(cols)
6517            correl = np.empty((K, K), dtype=float)
6518            mask = np.isfinite(mat)
6519            for i, ac in enumerate(mat):
6520                for j, bc in enumerate(mat):
6521                    if i > j:
6522                        continue
6523
6524                    valid = mask[i] & mask[j]
6525                    if valid.sum() < min_periods:
6526                        c = np.nan
6527                    elif i == j:
6528                        c = 1.
6529                    elif not valid.all():
6530                        c = corrf(ac[valid], bc[valid])
6531                    else:
6532                        c = corrf(ac, bc)
6533                    correl[i, j] = c
6534                    correl[j, i] = c
6535
6536        return self._constructor(correl, index=idx, columns=cols)
6537
6538    def cov(self, min_periods=None):
6539        """
6540        Compute pairwise covariance of columns, excluding NA/null values.
6541
6542        Compute the pairwise covariance among the series of a DataFrame.
6543        The returned data frame is the `covariance matrix
6544        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
6545        of the DataFrame.
6546
6547        Both NA and null values are automatically excluded from the
6548        calculation. (See the note below about bias from missing values.)
6549        A threshold can be set for the minimum number of
6550        observations for each value created. Comparisons with observations
6551        below this threshold will be returned as ``NaN``.
6552
6553        This method is generally used for the analysis of time series data to
6554        understand the relationship between different measures
6555        across time.
6556
6557        Parameters
6558        ----------
6559        min_periods : int, optional
6560            Minimum number of observations required per pair of columns
6561            to have a valid result.
6562
6563        Returns
6564        -------
6565        DataFrame
6566            The covariance matrix of the series of the DataFrame.
6567
6568        See Also
6569        --------
6570        pandas.Series.cov : compute covariance with another Series
6571        pandas.core.window.EWM.cov: expoential weighted sample covariance
6572        pandas.core.window.Expanding.cov : expanding sample covariance
6573        pandas.core.window.Rolling.cov : rolling sample covariance
6574
6575        Notes
6576        -----
6577        Returns the covariance matrix of the DataFrame's time series.
6578        The covariance is normalized by N-1.
6579
6580        For DataFrames that have Series that are missing data (assuming that
6581        data is `missing at random
6582        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
6583        the returned covariance matrix will be an unbiased estimate
6584        of the variance and covariance between the member Series.
6585
6586        However, for many applications this estimate may not be acceptable
6587        because the estimate covariance matrix is not guaranteed to be positive
6588        semi-definite. This could lead to estimate correlations having
6589        absolute values which are greater than one, and/or a non-invertible
6590        covariance matrix. See `Estimation of covariance matrices
6591        <http://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
6592        matrices>`__ for more details.
6593
6594        Examples
6595        --------
6596        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
6597        ...                   columns=['dogs', 'cats'])
6598        >>> df.cov()
6599                  dogs      cats
6600        dogs  0.666667 -1.000000
6601        cats -1.000000  1.666667
6602
6603        >>> np.random.seed(42)
6604        >>> df = pd.DataFrame(np.random.randn(1000, 5),
6605        ...                   columns=['a', 'b', 'c', 'd', 'e'])
6606        >>> df.cov()
6607                  a         b         c         d         e
6608        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
6609        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
6610        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
6611        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
6612        e  0.014144  0.009826 -0.000271 -0.013692  0.977795
6613
6614        **Minimum number of periods**
6615
6616        This method also supports an optional ``min_periods`` keyword
6617        that specifies the required minimum number of non-NA observations for
6618        each column pair in order to have a valid result:
6619
6620        >>> np.random.seed(42)
6621        >>> df = pd.DataFrame(np.random.randn(20, 3),
6622        ...                   columns=['a', 'b', 'c'])
6623        >>> df.loc[df.index[:5], 'a'] = np.nan
6624        >>> df.loc[df.index[5:10], 'b'] = np.nan
6625        >>> df.cov(min_periods=12)
6626                  a         b         c
6627        a  0.316741       NaN -0.150812
6628        b       NaN  1.248003  0.191417
6629        c -0.150812  0.191417  0.895202
6630        """
6631        numeric_df = self._get_numeric_data()
6632        cols = numeric_df.columns
6633        idx = cols.copy()
6634        mat = numeric_df.values
6635
6636        if notna(mat).all():
6637            if min_periods is not None and min_periods > len(mat):
6638                baseCov = np.empty((mat.shape[1], mat.shape[1]))
6639                baseCov.fill(np.nan)
6640            else:
6641                baseCov = np.cov(mat.T)
6642            baseCov = baseCov.reshape((len(cols), len(cols)))
6643        else:
6644            baseCov = libalgos.nancorr(_ensure_float64(mat), cov=True,
6645                                       minp=min_periods)
6646
6647        return self._constructor(baseCov, index=idx, columns=cols)
6648
6649    def corrwith(self, other, axis=0, drop=False):
6650        """
6651        Compute pairwise correlation between rows or columns of two DataFrame
6652        objects.
6653
6654        Parameters
6655        ----------
6656        other : DataFrame, Series
6657        axis : {0 or 'index', 1 or 'columns'}, default 0
6658            0 or 'index' to compute column-wise, 1 or 'columns' for row-wise
6659        drop : boolean, default False
6660            Drop missing indices from result, default returns union of all
6661
6662        Returns
6663        -------
6664        correls : Series
6665        """
6666        axis = self._get_axis_number(axis)
6667        this = self._get_numeric_data()
6668
6669        if isinstance(other, Series):
6670            return this.apply(other.corr, axis=axis)
6671
6672        other = other._get_numeric_data()
6673
6674        left, right = this.align(other, join='inner', copy=False)
6675
6676        # mask missing values
6677        left = left + right * 0
6678        right = right + left * 0
6679
6680        if axis == 1:
6681            left = left.T
6682            right = right.T
6683
6684        # demeaned data
6685        ldem = left - left.mean()
6686        rdem = right - right.mean()
6687
6688        num = (ldem * rdem).sum()
6689        dom = (left.count() - 1) * left.std() * right.std()
6690
6691        correl = num / dom
6692
6693        if not drop:
6694            raxis = 1 if axis == 0 else 0
6695            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
6696            correl = correl.reindex(result_index)
6697
6698        return correl
6699
6700    # ----------------------------------------------------------------------
6701    # ndarray-like stats methods
6702
6703    def count(self, axis=0, level=None, numeric_only=False):
6704        """
6705        Count non-NA cells for each column or row.
6706
6707        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
6708        on `pandas.options.mode.use_inf_as_na`) are considered NA.
6709
6710        Parameters
6711        ----------
6712        axis : {0 or 'index', 1 or 'columns'}, default 0
6713            If 0 or 'index' counts are generated for each column.
6714            If 1 or 'columns' counts are generated for each **row**.
6715        level : int or str, optional
6716            If the axis is a `MultiIndex` (hierarchical), count along a
6717            particular `level`, collapsing into a `DataFrame`.
6718            A `str` specifies the level name.
6719        numeric_only : boolean, default False
6720            Include only `float`, `int` or `boolean` data.
6721
6722        Returns
6723        -------
6724        Series or DataFrame
6725            For each column/row the number of non-NA/null entries.
6726            If `level` is specified returns a `DataFrame`.
6727
6728        See Also
6729        --------
6730        Series.count: number of non-NA elements in a Series
6731        DataFrame.shape: number of DataFrame rows and columns (including NA
6732            elements)
6733        DataFrame.isna: boolean same-sized DataFrame showing places of NA
6734            elements
6735
6736        Examples
6737        --------
6738        Constructing DataFrame from a dictionary:
6739
6740        >>> df = pd.DataFrame({"Person":
6741        ...                    ["John", "Myla", None, "John", "Myla"],
6742        ...                    "Age": [24., np.nan, 21., 33, 26],
6743        ...                    "Single": [False, True, True, True, False]})
6744        >>> df
6745           Person   Age  Single
6746        0    John  24.0   False
6747        1    Myla   NaN    True
6748        2    None  21.0    True
6749        3    John  33.0    True
6750        4    Myla  26.0   False
6751
6752        Notice the uncounted NA values:
6753
6754        >>> df.count()
6755        Person    4
6756        Age       4
6757        Single    5
6758        dtype: int64
6759
6760        Counts for each **row**:
6761
6762        >>> df.count(axis='columns')
6763        0    3
6764        1    2
6765        2    2
6766        3    3
6767        4    3
6768        dtype: int64
6769
6770        Counts for one level of a `MultiIndex`:
6771
6772        >>> df.set_index(["Person", "Single"]).count(level="Person")
6773                Age
6774        Person
6775        John      2
6776        Myla      1
6777        """
6778        axis = self._get_axis_number(axis)
6779        if level is not None:
6780            return self._count_level(level, axis=axis,
6781                                     numeric_only=numeric_only)
6782
6783        if numeric_only:
6784            frame = self._get_numeric_data()
6785        else:
6786            frame = self
6787
6788        # GH #423
6789        if len(frame._get_axis(axis)) == 0:
6790            result = Series(0, index=frame._get_agg_axis(axis))
6791        else:
6792            if frame._is_mixed_type or frame._data.any_extension_types:
6793                # the or any_extension_types is really only hit for single-
6794                # column frames with an extension array
6795                result = notna(frame).sum(axis=axis)
6796            else:
6797                # GH13407
6798                series_counts = notna(frame).sum(axis=axis)
6799                counts = series_counts.values
6800                result = Series(counts, index=frame._get_agg_axis(axis))
6801
6802        return result.astype('int64')
6803
6804    def _count_level(self, level, axis=0, numeric_only=False):
6805        if numeric_only:
6806            frame = self._get_numeric_data()
6807        else:
6808            frame = self
6809
6810        count_axis = frame._get_axis(axis)
6811        agg_axis = frame._get_agg_axis(axis)
6812
6813        if not isinstance(count_axis, MultiIndex):
6814            raise TypeError("Can only count levels on hierarchical "
6815                            "{ax}.".format(ax=self._get_axis_name(axis)))
6816
6817        if frame._is_mixed_type:
6818            # Since we have mixed types, calling notna(frame.values) might
6819            # upcast everything to object
6820            mask = notna(frame).values
6821        else:
6822            # But use the speedup when we have homogeneous dtypes
6823            mask = notna(frame.values)
6824
6825        if axis == 1:
6826            # We're transposing the mask rather than frame to avoid potential
6827            # upcasts to object, which induces a ~20x slowdown
6828            mask = mask.T
6829
6830        if isinstance(level, compat.string_types):
6831            level = count_axis._get_level_number(level)
6832
6833        level_index = count_axis.levels[level]
6834        labels = _ensure_int64(count_axis.labels[level])
6835        counts = lib.count_level_2d(mask, labels, len(level_index), axis=0)
6836
6837        result = DataFrame(counts, index=level_index, columns=agg_axis)
6838
6839        if axis == 1:
6840            # Undo our earlier transpose
6841            return result.T
6842        else:
6843            return result
6844
6845    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
6846                filter_type=None, **kwds):
6847        if axis is None and filter_type == 'bool':
6848            labels = None
6849            constructor = None
6850        else:
6851            # TODO: Make other agg func handle axis=None properly
6852            axis = self._get_axis_number(axis)
6853            labels = self._get_agg_axis(axis)
6854            constructor = self._constructor
6855
6856        def f(x):
6857            return op(x, axis=axis, skipna=skipna, **kwds)
6858
6859        # exclude timedelta/datetime unless we are uniform types
6860        if axis == 1 and self._is_mixed_type and self._is_datelike_mixed_type:
6861            numeric_only = True
6862
6863        if numeric_only is None:
6864            try:
6865                values = self.values
6866                result = f(values)
6867
6868                if (filter_type == 'bool' and is_object_dtype(values) and
6869                        axis is None):
6870                    # work around https://github.com/numpy/numpy/issues/10489
6871                    # TODO: combine with hasattr(result, 'dtype') further down
6872                    # hard since we don't have `values` down there.
6873                    result = np.bool_(result)
6874            except Exception as e:
6875
6876                # try by-column first
6877                if filter_type is None and axis == 0:
6878                    try:
6879
6880                        # this can end up with a non-reduction
6881                        # but not always. if the types are mixed
6882                        # with datelike then need to make sure a series
6883
6884                        # we only end up here if we have not specified
6885                        # numeric_only and yet we have tried a
6886                        # column-by-column reduction, where we have mixed type.
6887                        # So let's just do what we can
6888                        from pandas.core.apply import frame_apply
6889                        opa = frame_apply(self,
6890                                          func=f,
6891                                          result_type='expand',
6892                                          ignore_failures=True)
6893                        result = opa.get_result()
6894                        if result.ndim == self.ndim:
6895                            result = result.iloc[0]
6896                        return result
6897                    except Exception:
6898                        pass
6899
6900                if filter_type is None or filter_type == 'numeric':
6901                    data = self._get_numeric_data()
6902                elif filter_type == 'bool':
6903                    data = self._get_bool_data()
6904                else:  # pragma: no cover
6905                    e = NotImplementedError(
6906                        "Handling exception with filter_type {f} not"
6907                        "implemented.".format(f=filter_type))
6908                    raise_with_traceback(e)
6909                with np.errstate(all='ignore'):
6910                    result = f(data.values)
6911                labels = data._get_agg_axis(axis)
6912        else:
6913            if numeric_only:
6914                if filter_type is None or filter_type == 'numeric':
6915                    data = self._get_numeric_data()
6916                elif filter_type == 'bool':
6917                    data = self._get_bool_data()
6918                else:  # pragma: no cover
6919                    msg = ("Generating numeric_only data with filter_type {f}"
6920                           "not supported.".format(f=filter_type))
6921                    raise NotImplementedError(msg)
6922                values = data.values
6923                labels = data._get_agg_axis(axis)
6924            else:
6925                values = self.values
6926            result = f(values)
6927
6928        if hasattr(result, 'dtype') and is_object_dtype(result.dtype):
6929            try:
6930                if filter_type is None or filter_type == 'numeric':
6931                    result = result.astype(np.float64)
6932                elif filter_type == 'bool' and notna(result).all():
6933                    result = result.astype(np.bool_)
6934            except (ValueError, TypeError):
6935
6936                # try to coerce to the original dtypes item by item if we can
6937                if axis == 0:
6938                    result = coerce_to_dtypes(result, self.dtypes)
6939
6940        if constructor is not None:
6941            result = Series(result, index=labels)
6942        return result
6943
6944    def nunique(self, axis=0, dropna=True):
6945        """
6946        Return Series with number of distinct observations over requested
6947        axis.
6948
6949        .. versionadded:: 0.20.0
6950
6951        Parameters
6952        ----------
6953        axis : {0 or 'index', 1 or 'columns'}, default 0
6954        dropna : boolean, default True
6955            Don't include NaN in the counts.
6956
6957        Returns
6958        -------
6959        nunique : Series
6960
6961        Examples
6962        --------
6963        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 1, 1]})
6964        >>> df.nunique()
6965        A    3
6966        B    1
6967
6968        >>> df.nunique(axis=1)
6969        0    1
6970        1    2
6971        2    2
6972        """
6973        return self.apply(Series.nunique, axis=axis, dropna=dropna)
6974
6975    def idxmin(self, axis=0, skipna=True):
6976        """
6977        Return index of first occurrence of minimum over requested axis.
6978        NA/null values are excluded.
6979
6980        Parameters
6981        ----------
6982        axis : {0 or 'index', 1 or 'columns'}, default 0
6983            0 or 'index' for row-wise, 1 or 'columns' for column-wise
6984        skipna : boolean, default True
6985            Exclude NA/null values. If an entire row/column is NA, the result
6986            will be NA.
6987
6988        Raises
6989        ------
6990        ValueError
6991            * If the row/column is empty
6992
6993        Returns
6994        -------
6995        idxmin : Series
6996
6997        Notes
6998        -----
6999        This method is the DataFrame version of ``ndarray.argmin``.
7000
7001        See Also
7002        --------
7003        Series.idxmin
7004        """
7005        axis = self._get_axis_number(axis)
7006        indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
7007        index = self._get_axis(axis)
7008        result = [index[i] if i >= 0 else np.nan for i in indices]
7009        return Series(result, index=self._get_agg_axis(axis))
7010
7011    def idxmax(self, axis=0, skipna=True):
7012        """
7013        Return index of first occurrence of maximum over requested axis.
7014        NA/null values are excluded.
7015
7016        Parameters
7017        ----------
7018        axis : {0 or 'index', 1 or 'columns'}, default 0
7019            0 or 'index' for row-wise, 1 or 'columns' for column-wise
7020        skipna : boolean, default True
7021            Exclude NA/null values. If an entire row/column is NA, the result
7022            will be NA.
7023
7024        Raises
7025        ------
7026        ValueError
7027            * If the row/column is empty
7028
7029        Returns
7030        -------
7031        idxmax : Series
7032
7033        Notes
7034        -----
7035        This method is the DataFrame version of ``ndarray.argmax``.
7036
7037        See Also
7038        --------
7039        Series.idxmax
7040        """
7041        axis = self._get_axis_number(axis)
7042        indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna)
7043        index = self._get_axis(axis)
7044        result = [index[i] if i >= 0 else np.nan for i in indices]
7045        return Series(result, index=self._get_agg_axis(axis))
7046
7047    def _get_agg_axis(self, axis_num):
7048        """ let's be explicit about this """
7049        if axis_num == 0:
7050            return self.columns
7051        elif axis_num == 1:
7052            return self.index
7053        else:
7054            raise ValueError('Axis must be 0 or 1 (got %r)' % axis_num)
7055
7056    def mode(self, axis=0, numeric_only=False):
7057        """
7058        Gets the mode(s) of each element along the axis selected. Adds a row
7059        for each mode per label, fills in gaps with nan.
7060
7061        Note that there could be multiple values returned for the selected
7062        axis (when more than one item share the maximum frequency), which is
7063        the reason why a dataframe is returned. If you want to impute missing
7064        values with the mode in a dataframe ``df``, you can just do this:
7065        ``df.fillna(df.mode().iloc[0])``
7066
7067        Parameters
7068        ----------
7069        axis : {0 or 'index', 1 or 'columns'}, default 0
7070            * 0 or 'index' : get mode of each column
7071            * 1 or 'columns' : get mode of each row
7072        numeric_only : boolean, default False
7073            if True, only apply to numeric columns
7074
7075        Returns
7076        -------
7077        modes : DataFrame (sorted)
7078
7079        Examples
7080        --------
7081        >>> df = pd.DataFrame({'A': [1, 2, 1, 2, 1, 2, 3]})
7082        >>> df.mode()
7083           A
7084        0  1
7085        1  2
7086        """
7087        data = self if not numeric_only else self._get_numeric_data()
7088
7089        def f(s):
7090            return s.mode()
7091
7092        return data.apply(f, axis=axis)
7093
7094    def quantile(self, q=0.5, axis=0, numeric_only=True,
7095                 interpolation='linear'):
7096        """
7097        Return values at the given quantile over requested axis, a la
7098        numpy.percentile.
7099
7100        Parameters
7101        ----------
7102        q : float or array-like, default 0.5 (50% quantile)
7103            0 <= q <= 1, the quantile(s) to compute
7104        axis : {0, 1, 'index', 'columns'} (default 0)
7105            0 or 'index' for row-wise, 1 or 'columns' for column-wise
7106        numeric_only : boolean, default True
7107            If False, the quantile of datetime and timedelta data will be
7108            computed as well
7109        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
7110            .. versionadded:: 0.18.0
7111
7112            This optional parameter specifies the interpolation method to use,
7113            when the desired quantile lies between two data points `i` and `j`:
7114
7115            * linear: `i + (j - i) * fraction`, where `fraction` is the
7116              fractional part of the index surrounded by `i` and `j`.
7117            * lower: `i`.
7118            * higher: `j`.
7119            * nearest: `i` or `j` whichever is nearest.
7120            * midpoint: (`i` + `j`) / 2.
7121
7122        Returns
7123        -------
7124        quantiles : Series or DataFrame
7125
7126            - If ``q`` is an array, a DataFrame will be returned where the
7127              index is ``q``, the columns are the columns of self, and the
7128              values are the quantiles.
7129            - If ``q`` is a float, a Series will be returned where the
7130              index is the columns of self and the values are the quantiles.
7131
7132        Examples
7133        --------
7134
7135        >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
7136                              columns=['a', 'b'])
7137        >>> df.quantile(.1)
7138        a    1.3
7139        b    3.7
7140        dtype: float64
7141        >>> df.quantile([.1, .5])
7142               a     b
7143        0.1  1.3   3.7
7144        0.5  2.5  55.0
7145
7146        Specifying `numeric_only=False` will also compute the quantile of
7147        datetime and timedelta data.
7148
7149        >>> df = pd.DataFrame({'A': [1, 2],
7150                               'B': [pd.Timestamp('2010'),
7151                                     pd.Timestamp('2011')],
7152                               'C': [pd.Timedelta('1 days'),
7153                                     pd.Timedelta('2 days')]})
7154        >>> df.quantile(0.5, numeric_only=False)
7155        A                    1.5
7156        B    2010-07-02 12:00:00
7157        C        1 days 12:00:00
7158        Name: 0.5, dtype: object
7159
7160        See Also
7161        --------
7162        pandas.core.window.Rolling.quantile
7163        """
7164        self._check_percentile(q)
7165
7166        data = self._get_numeric_data() if numeric_only else self
7167        axis = self._get_axis_number(axis)
7168        is_transposed = axis == 1
7169
7170        if is_transposed:
7171            data = data.T
7172
7173        result = data._data.quantile(qs=q,
7174                                     axis=1,
7175                                     interpolation=interpolation,
7176                                     transposed=is_transposed)
7177
7178        if result.ndim == 2:
7179            result = self._constructor(result)
7180        else:
7181            result = self._constructor_sliced(result, name=q)
7182
7183        if is_transposed:
7184            result = result.T
7185
7186        return result
7187
7188    def to_timestamp(self, freq=None, how='start', axis=0, copy=True):
7189        """
7190        Cast to DatetimeIndex of timestamps, at *beginning* of period
7191
7192        Parameters
7193        ----------
7194        freq : string, default frequency of PeriodIndex
7195            Desired frequency
7196        how : {'s', 'e', 'start', 'end'}
7197            Convention for converting period to timestamp; start of period
7198            vs. end
7199        axis : {0 or 'index', 1 or 'columns'}, default 0
7200            The axis to convert (the index by default)
7201        copy : boolean, default True
7202            If false then underlying input data is not copied
7203
7204        Returns
7205        -------
7206        df : DataFrame with DatetimeIndex
7207        """
7208        new_data = self._data
7209        if copy:
7210            new_data = new_data.copy()
7211
7212        axis = self._get_axis_number(axis)
7213        if axis == 0:
7214            new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how))
7215        elif axis == 1:
7216            new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how))
7217        else:  # pragma: no cover
7218            raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
7219                ax=axis))
7220
7221        return self._constructor(new_data)
7222
7223    def to_period(self, freq=None, axis=0, copy=True):
7224        """
7225        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
7226        frequency (inferred from index if not passed)
7227
7228        Parameters
7229        ----------
7230        freq : string, default
7231        axis : {0 or 'index', 1 or 'columns'}, default 0
7232            The axis to convert (the index by default)
7233        copy : boolean, default True
7234            If False then underlying input data is not copied
7235
7236        Returns
7237        -------
7238        ts : TimeSeries with PeriodIndex
7239        """
7240        new_data = self._data
7241        if copy:
7242            new_data = new_data.copy()
7243
7244        axis = self._get_axis_number(axis)
7245        if axis == 0:
7246            new_data.set_axis(1, self.index.to_period(freq=freq))
7247        elif axis == 1:
7248            new_data.set_axis(0, self.columns.to_period(freq=freq))
7249        else:  # pragma: no cover
7250            raise AssertionError('Axis must be 0 or 1. Got {ax!s}'.format(
7251                ax=axis))
7252
7253        return self._constructor(new_data)
7254
7255    def isin(self, values):
7256        """
7257        Return boolean DataFrame showing whether each element in the
7258        DataFrame is contained in values.
7259
7260        Parameters
7261        ----------
7262        values : iterable, Series, DataFrame or dictionary
7263            The result will only be true at a location if all the
7264            labels match. If `values` is a Series, that's the index. If
7265            `values` is a dictionary, the keys must be the column names,
7266            which must match. If `values` is a DataFrame,
7267            then both the index and column labels must match.
7268
7269        Returns
7270        -------
7271
7272        DataFrame of booleans
7273
7274        Examples
7275        --------
7276        When ``values`` is a list:
7277
7278        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
7279        >>> df.isin([1, 3, 12, 'a'])
7280               A      B
7281        0   True   True
7282        1  False  False
7283        2   True  False
7284
7285        When ``values`` is a dict:
7286
7287        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
7288        >>> df.isin({'A': [1, 3], 'B': [4, 7, 12]})
7289               A      B
7290        0   True  False  # Note that B didn't match the 1 here.
7291        1  False   True
7292        2   True   True
7293
7294        When ``values`` is a Series or DataFrame:
7295
7296        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
7297        >>> other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
7298        >>> df.isin(other)
7299               A      B
7300        0   True  False
7301        1  False  False  # Column A in `other` has a 3, but not at index 1.
7302        2   True   True
7303        """
7304        if isinstance(values, dict):
7305            from pandas.core.reshape.concat import concat
7306            values = collections.defaultdict(list, values)
7307            return concat((self.iloc[:, [i]].isin(values[col])
7308                           for i, col in enumerate(self.columns)), axis=1)
7309        elif isinstance(values, Series):
7310            if not values.index.is_unique:
7311                raise ValueError("cannot compute isin with "
7312                                 "a duplicate axis.")
7313            return self.eq(values.reindex_like(self), axis='index')
7314        elif isinstance(values, DataFrame):
7315            if not (values.columns.is_unique and values.index.is_unique):
7316                raise ValueError("cannot compute isin with "
7317                                 "a duplicate axis.")
7318            return self.eq(values.reindex_like(self))
7319        else:
7320            if not is_list_like(values):
7321                raise TypeError("only list-like or dict-like objects are "
7322                                "allowed to be passed to DataFrame.isin(), "
7323                                "you passed a "
7324                                "{0!r}".format(type(values).__name__))
7325            return DataFrame(
7326                algorithms.isin(self.values.ravel(),
7327                                values).reshape(self.shape), self.index,
7328                self.columns)
7329
7330    # ----------------------------------------------------------------------
7331    # Add plotting methods to DataFrame
7332    plot = CachedAccessor("plot", gfx.FramePlotMethods)
7333    hist = gfx.hist_frame
7334    boxplot = gfx.boxplot_frame
7335
7336
7337DataFrame._setup_axes(['index', 'columns'], info_axis=1, stat_axis=0,
7338                      axes_are_reversed=True, aliases={'rows': 0},
7339                      docs={
7340                          'index': 'The index (row labels) of the DataFrame.',
7341                          'columns': 'The column labels of the DataFrame.'})
7342DataFrame._add_numeric_operations()
7343DataFrame._add_series_or_dataframe_operations()
7344
7345ops.add_flex_arithmetic_methods(DataFrame)
7346ops.add_special_arithmetic_methods(DataFrame)
7347
7348
7349def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
7350    """
7351    Segregate Series based on type and coerce into matrices.
7352    Needs to handle a lot of exceptional cases.
7353    """
7354    # figure out the index, if necessary
7355    if index is None:
7356        index = extract_index(arrays)
7357
7358    # don't force copy because getting jammed in an ndarray anyway
7359    arrays = _homogenize(arrays, index, dtype)
7360
7361    # from BlockManager perspective
7362    axes = [_ensure_index(columns), _ensure_index(index)]
7363
7364    return create_block_manager_from_arrays(arrays, arr_names, axes)
7365
7366
7367def extract_index(data):
7368    from pandas.core.index import _union_indexes
7369
7370    index = None
7371    if len(data) == 0:
7372        index = Index([])
7373    elif len(data) > 0:
7374        raw_lengths = []
7375        indexes = []
7376
7377        have_raw_arrays = False
7378        have_series = False
7379        have_dicts = False
7380
7381        for v in data:
7382            if isinstance(v, Series):
7383                have_series = True
7384                indexes.append(v.index)
7385            elif isinstance(v, dict):
7386                have_dicts = True
7387                indexes.append(list(v.keys()))
7388            elif is_list_like(v) and getattr(v, 'ndim', 1) == 1:
7389                have_raw_arrays = True
7390                raw_lengths.append(len(v))
7391
7392        if not indexes and not raw_lengths:
7393            raise ValueError('If using all scalar values, you must pass'
7394                             ' an index')
7395
7396        if have_series or have_dicts:
7397            index = _union_indexes(indexes)
7398
7399        if have_raw_arrays:
7400            lengths = list(set(raw_lengths))
7401            if len(lengths) > 1:
7402                raise ValueError('arrays must all be same length')
7403
7404            if have_dicts:
7405                raise ValueError('Mixing dicts with non-Series may lead to '
7406                                 'ambiguous ordering.')
7407
7408            if have_series:
7409                if lengths[0] != len(index):
7410                    msg = ('array length %d does not match index length %d' %
7411                           (lengths[0], len(index)))
7412                    raise ValueError(msg)
7413            else:
7414                index = com._default_index(lengths[0])
7415
7416    return _ensure_index(index)
7417
7418
7419def _prep_ndarray(values, copy=True):
7420    if not isinstance(values, (np.ndarray, Series, Index)):
7421        if len(values) == 0:
7422            return np.empty((0, 0), dtype=object)
7423
7424        def convert(v):
7425            return maybe_convert_platform(v)
7426
7427        # we could have a 1-dim or 2-dim list here
7428        # this is equiv of np.asarray, but does object conversion
7429        # and platform dtype preservation
7430        try:
7431            if is_list_like(values[0]) or hasattr(values[0], 'len'):
7432                values = np.array([convert(v) for v in values])
7433            else:
7434                values = convert(values)
7435        except:
7436            values = convert(values)
7437
7438    else:
7439
7440        # drop subclass info, do not copy data
7441        values = np.asarray(values)
7442        if copy:
7443            values = values.copy()
7444
7445    if values.ndim == 1:
7446        values = values.reshape((values.shape[0], 1))
7447    elif values.ndim != 2:
7448        raise ValueError('Must pass 2-d input')
7449
7450    return values
7451
7452
7453def _to_arrays(data, columns, coerce_float=False, dtype=None):
7454    """
7455    Return list of arrays, columns
7456    """
7457    if isinstance(data, DataFrame):
7458        if columns is not None:
7459            arrays = [data._ixs(i, axis=1).values
7460                      for i, col in enumerate(data.columns) if col in columns]
7461        else:
7462            columns = data.columns
7463            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]
7464
7465        return arrays, columns
7466
7467    if not len(data):
7468        if isinstance(data, np.ndarray):
7469            columns = data.dtype.names
7470            if columns is not None:
7471                return [[]] * len(columns), columns
7472        return [], []  # columns if columns is not None else []
7473    if isinstance(data[0], (list, tuple)):
7474        return _list_to_arrays(data, columns, coerce_float=coerce_float,
7475                               dtype=dtype)
7476    elif isinstance(data[0], collections.Mapping):
7477        return _list_of_dict_to_arrays(data, columns,
7478                                       coerce_float=coerce_float, dtype=dtype)
7479    elif isinstance(data[0], Series):
7480        return _list_of_series_to_arrays(data, columns,
7481                                         coerce_float=coerce_float,
7482                                         dtype=dtype)
7483    elif isinstance(data[0], Categorical):
7484        if columns is None:
7485            columns = com._default_index(len(data))
7486        return data, columns
7487    elif (isinstance(data, (np.ndarray, Series, Index)) and
7488          data.dtype.names is not None):
7489
7490        columns = list(data.dtype.names)
7491        arrays = [data[k] for k in columns]
7492        return arrays, columns
7493    else:
7494        # last ditch effort
7495        data = lmap(tuple, data)
7496        return _list_to_arrays(data, columns, coerce_float=coerce_float,
7497                               dtype=dtype)
7498
7499
7500def _masked_rec_array_to_mgr(data, index, columns, dtype, copy):
7501    """ extract from a masked rec array and create the manager """
7502
7503    # essentially process a record array then fill it
7504    fill_value = data.fill_value
7505    fdata = ma.getdata(data)
7506    if index is None:
7507        index = _get_names_from_index(fdata)
7508        if index is None:
7509            index = com._default_index(len(data))
7510    index = _ensure_index(index)
7511
7512    if columns is not None:
7513        columns = _ensure_index(columns)
7514    arrays, arr_columns = _to_arrays(fdata, columns)
7515
7516    # fill if needed
7517    new_arrays = []
7518    for fv, arr, col in zip(fill_value, arrays, arr_columns):
7519        mask = ma.getmaskarray(data[col])
7520        if mask.any():
7521            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
7522            arr[mask] = fv
7523        new_arrays.append(arr)
7524
7525    # create the manager
7526    arrays, arr_columns = _reorder_arrays(new_arrays, arr_columns, columns)
7527    if columns is None:
7528        columns = arr_columns
7529
7530    mgr = _arrays_to_mgr(arrays, arr_columns, index, columns)
7531
7532    if copy:
7533        mgr = mgr.copy()
7534    return mgr
7535
7536
7537def _reorder_arrays(arrays, arr_columns, columns):
7538    # reorder according to the columns
7539    if (columns is not None and len(columns) and arr_columns is not None and
7540            len(arr_columns)):
7541        indexer = _ensure_index(arr_columns).get_indexer(columns)
7542        arr_columns = _ensure_index([arr_columns[i] for i in indexer])
7543        arrays = [arrays[i] for i in indexer]
7544    return arrays, arr_columns
7545
7546
7547def _list_to_arrays(data, columns, coerce_float=False, dtype=None):
7548    if len(data) > 0 and isinstance(data[0], tuple):
7549        content = list(lib.to_object_array_tuples(data).T)
7550    else:
7551        # list of lists
7552        content = list(lib.to_object_array(data).T)
7553    return _convert_object_array(content, columns, dtype=dtype,
7554                                 coerce_float=coerce_float)
7555
7556
7557def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
7558    from pandas.core.index import _get_objs_combined_axis
7559
7560    if columns is None:
7561        columns = _get_objs_combined_axis(data, sort=False)
7562
7563    indexer_cache = {}
7564
7565    aligned_values = []
7566    for s in data:
7567        index = getattr(s, 'index', None)
7568        if index is None:
7569            index = com._default_index(len(s))
7570
7571        if id(index) in indexer_cache:
7572            indexer = indexer_cache[id(index)]
7573        else:
7574            indexer = indexer_cache[id(index)] = index.get_indexer(columns)
7575
7576        values = com._values_from_object(s)
7577        aligned_values.append(algorithms.take_1d(values, indexer))
7578
7579    values = np.vstack(aligned_values)
7580
7581    if values.dtype == np.object_:
7582        content = list(values.T)
7583        return _convert_object_array(content, columns, dtype=dtype,
7584                                     coerce_float=coerce_float)
7585    else:
7586        return values.T, columns
7587
7588
7589def _list_of_dict_to_arrays(data, columns, coerce_float=False, dtype=None):
7590    if columns is None:
7591        gen = (list(x.keys()) for x in data)
7592        sort = not any(isinstance(d, OrderedDict) for d in data)
7593        columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
7594
7595    # assure that they are of the base dict class and not of derived
7596    # classes
7597    data = [(type(d) is dict) and d or dict(d) for d in data]
7598
7599    content = list(lib.dicts_to_array(data, list(columns)).T)
7600    return _convert_object_array(content, columns, dtype=dtype,
7601                                 coerce_float=coerce_float)
7602
7603
7604def _convert_object_array(content, columns, coerce_float=False, dtype=None):
7605    if columns is None:
7606        columns = com._default_index(len(content))
7607    else:
7608        if len(columns) != len(content):  # pragma: no cover
7609            # caller's responsibility to check for this...
7610            raise AssertionError('{col:d} columns passed, passed data had '
7611                                 '{con} columns'.format(col=len(columns),
7612                                                        con=len(content)))
7613
7614    # provide soft conversion of object dtypes
7615    def convert(arr):
7616        if dtype != object and dtype != np.object:
7617            arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
7618            arr = maybe_cast_to_datetime(arr, dtype)
7619        return arr
7620
7621    arrays = [convert(arr) for arr in content]
7622
7623    return arrays, columns
7624
7625
7626def _get_names_from_index(data):
7627    has_some_name = any(getattr(s, 'name', None) is not None for s in data)
7628    if not has_some_name:
7629        return com._default_index(len(data))
7630
7631    index = lrange(len(data))
7632    count = 0
7633    for i, s in enumerate(data):
7634        n = getattr(s, 'name', None)
7635        if n is not None:
7636            index[i] = n
7637        else:
7638            index[i] = 'Unnamed %d' % count
7639            count += 1
7640
7641    return index
7642
7643
7644def _homogenize(data, index, dtype=None):
7645    from pandas.core.series import _sanitize_array
7646
7647    oindex = None
7648    homogenized = []
7649
7650    for v in data:
7651        if isinstance(v, Series):
7652            if dtype is not None:
7653                v = v.astype(dtype)
7654            if v.index is not index:
7655                # Forces alignment. No need to copy data since we
7656                # are putting it into an ndarray later
7657                v = v.reindex(index, copy=False)
7658        else:
7659            if isinstance(v, dict):
7660                if oindex is None:
7661                    oindex = index.astype('O')
7662
7663                if isinstance(index, (DatetimeIndex, TimedeltaIndex)):
7664                    v = com._dict_compat(v)
7665                else:
7666                    v = dict(v)
7667                v = lib.fast_multiget(v, oindex.values, default=np.nan)
7668            v = _sanitize_array(v, index, dtype=dtype, copy=False,
7669                                raise_cast_failure=False)
7670
7671        homogenized.append(v)
7672
7673    return homogenized
7674
7675
7676def _from_nested_dict(data):
7677    # TODO: this should be seriously cythonized
7678    new_data = OrderedDict()
7679    for index, s in compat.iteritems(data):
7680        for col, v in compat.iteritems(s):
7681            new_data[col] = new_data.get(col, OrderedDict())
7682            new_data[col][index] = v
7683    return new_data
7684
7685
7686def _put_str(s, space):
7687    return u'{s}'.format(s=s)[:space].ljust(space)