Skip to content

Latest commit

 

History

History
4384 lines (3705 loc) · 76.7 KB

task03.md

File metadata and controls

4384 lines (3705 loc) · 76.7 KB

一、索引器

1.表的列索引

import numpy as np
import pandas as pd
df = pd.read_csv(r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv",
                 usecols = ['School', 'Grade', 'Name', 'Gender',
                              'Weight', 'Transfer'])
df['Name'].head()
0      Gaopeng Yang
1    Changqiang You
2           Mei Sun
3      Xiaojuan Sun
4       Gaojuan You
Name: Name, dtype: object
df[['Gender','Name']].head()#获得多个列
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Gender Name
0 Female Gaopeng Yang
1 Male Changqiang You
2 Male Mei Sun
3 Female Xiaojuan Sun
4 Male Gaojuan You
df.Name.head()#获得单列,且列名中不包空和上面df['Name']是一样的
0      Gaopeng Yang
1    Changqiang You
2           Mei Sun
3      Xiaojuan Sun
4       Gaojuan You
Name: Name, dtype: object

2.序列的行索引

以字符串为索引的 Series

A = pd.Series([1,2,3,4,5,6],index=['a','b','a','a','a','c'])
A['a']
a    1
a    3
a    4
dtype: int64
#如果取出多个索引的对应元素,则可以使用 [items的列表] :
A[['c','b']]
c    6
b    5
dtype: int64
A = pd.Series([1,2,3,4,5,6],index=['a','b','a','a','a','c'])
A['c': 'b': -1]#获得两个索引之间的元素
c    6
a    5
a    4
a    3
b    2
dtype: int64

3. loc索引器

前面讲到了对 DataFrame 的列进行选取,下面要讨论其行的选取。对于表而言,有两种索引器,一种是基于 元素 的 loc 索引器,另一种是基于 位置 的 iloc 索引器。loc 索引器的一般形式是 loc[*, ] ,其中第一个 * 代表行的选择,第二个 * 代表列的选择,如果省略第二个位置写作 loc[] ,这个 * 是指行的筛选。其中, * 的位置一共有五类合法对象,分别是:单个元素、元素列表、元素切片、布尔列表以及函数,下面将依次说明。 为了演示相应操作,先利用 set_index 方法把 Name 列设为索引,关于该函数的其他用法将在多级索引一章介绍。

import numpy as np
import pandas as pd
df = pd.read_csv(r"C:\Users\zhoukaiwei\Desktop\joyful-pandas\data\learn_pandas.csv",
                 usecols = ['School', 'Grade', 'Name', 'Gender',
                              'Weight', 'Transfer'])
df_A =df.set_index('Name')
df_A.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender Weight Transfer
Name
Gaopeng Yang Shanghai Jiao Tong University Freshman Female 46.0 N
Changqiang You Peking University Freshman Male 70.0 N
Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N
Xiaojuan Sun Fudan University Sophomore Female 41.0 N
Gaojuan You Fudan University Sophomore Male 74.0 N
  • 为单个元素
df_A.loc['Qiang Sun']
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender Weight Transfer
Name
Qiang Sun Tsinghua University Junior Female 53.0 N
Qiang Sun Tsinghua University Sophomore Female 40.0 N
Qiang Sun Shanghai Jiao Tong University Junior Female NaN N
df_A.loc['Quan Zhao']
School      Shanghai Jiao Tong University
Grade                              Junior
Gender                             Female
Weight                                 53
Transfer                                N
Name: Quan Zhao, dtype: object
df_A.loc['Qiang Sun','School']
Name
Qiang Sun              Tsinghua University
Qiang Sun              Tsinghua University
Qiang Sun    Shanghai Jiao Tong University
Name: School, dtype: object
  • 为元素列表
#取出列表中所有元素值对应的行或列:
df_A.loc[['Qiang Sun','Quan Zhao'],['School','Gender']]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Gender
Name
Qiang Sun Tsinghua University Female
Qiang Sun Tsinghua University Female
Qiang Sun Shanghai Jiao Tong University Female
Quan Zhao Shanghai Jiao Tong University Female
  • 为切片
df_A.loc['Gaojuan You':'Gaoqiang Qian','School':'Gender']#使用切片
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender
Name
Gaojuan You Fudan University Sophomore Male
Xiaoli Qian Tsinghua University Freshman Female
Qiang Chu Shanghai Jiao Tong University Freshman Female
Gaoqiang Qian Tsinghua University Junior Female
  • 为布尔列表

在实际的数据处理中,根据条件来筛选行是极其常见的,此处传入 loc 的布尔列表与 DataFrame 长度相同,且列表为 True 的位置所对应的行会被选中, False 则会被剔除。

df_A.loc[df_A.Weight>70].head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender Weight Transfer
Name
Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N
Gaojuan You Fudan University Sophomore Male 74.0 N
Xiaopeng Zhou Shanghai Jiao Tong University Freshman Male 74.0 N
Xiaofeng Sun Tsinghua University Senior Male 71.0 N
Qiang Zheng Shanghai Jiao Tong University Senior Male 87.0 N
df_A.loc[df_A.Grade.isin(['Freshman', 'Senior'])].head()#获得大一和大四的学生
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender Weight Transfer
Name
Gaopeng Yang Shanghai Jiao Tong University Freshman Female 46.0 N
Changqiang You Peking University Freshman Male 70.0 N
Mei Sun Shanghai Jiao Tong University Senior Male 89.0 N
Xiaoli Qian Tsinghua University Freshman Female 51.0 N
Qiang Chu Shanghai Jiao Tong University Freshman Female 52.0 N
  • 为函数
def condition(x):
        condition_1_1 = x.School == 'Fudan University'
        condition_1_2 = x.Grade == 'Senior'
        condition_1_3 = x.Weight > 70
        condition_1 = condition_1_1 & condition_1_2 & condition_1_3
        condition_2_1 = x.School == 'Peking University'
        condition_2_2 = x.Grade == 'Senior'
        condition_2_3 = x.Weight > 80
        condition_2 = condition_2_1 & (~condition_2_2) & condition_2_3
        result = condition_1 | condition_2
        return result
df_A.loc[condition]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender Weight Transfer
Name
Qiang Han Peking University Freshman Male 87.0 N
Chengpeng Zhou Fudan University Senior Male 81.0 N
Changpeng Zhao Peking University Freshman Male 83.0 N
Chengpeng Qian Fudan University Senior Male 73.0 Y
#由于函数无法返回如 start: end: step 的切片形式,故返回切片时要用 slice 对象进行包装:
df_A.loc[lambda x: slice('Gaojuan You','Gaoqiang Qian')]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Gender Weight Transfer
Name
Gaojuan You Fudan University Sophomore Male 74.0 N
Xiaoli Qian Tsinghua University Freshman Female 51.0 N
Qiang Chu Shanghai Jiao Tong University Freshman Female 52.0 N
Gaoqiang Qian Tsinghua University Junior Female 50.0 N

4. iloc索引器

iloc 的使用与 loc 完全类似,只不过是针对位置进行筛选,在相应的 * 位置处一共也有五类 合法对象,分别是:整数、整数列表、整数切片、布尔列表以及函数,函数的返回值必须是前面 的四类合法对象中的一个,其输入同样也为 DataFrame 本身。

df_A.iloc[1, 1] # 第二行第二列
'Freshman'
df_A.iloc[[0,1],[0,1]]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade
Name
Gaopeng Yang Shanghai Jiao Tong University Freshman
Changqiang You Peking University Freshman
df_A.iloc[1: 4, 2:4] # 切片不包含结束端点
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Gender Weight
Name
Changqiang You Male 70.0
Mei Sun Male 89.0
Xiaojuan Sun Female 41.0

5. query方法

#在 pandas 中,支持把字符串形式的查询表达式传入 query 方法来查询数据,其表
#达式的执行结果必须返回布尔列表
df.query('((School == "Fudan University")&(Grade == "Senior")&(Weight > 70))')
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Name Gender Weight Transfer
66 Fudan University Senior Chengpeng Zhou Male 81.0 N
131 Fudan University Senior Chengpeng Qian Male 73.0 Y
df.query('Grade == ["Junior", "Senior"]').head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
School Grade Name Gender Weight Transfer
2 Shanghai Jiao Tong University Senior Mei Sun Male 89.0 N
7 Tsinghua University Junior Gaoqiang Qian Female 50.0 N
9 Peking University Junior Juan Xu Female NaN N
11 Tsinghua University Junior Xiaoquan Lv Female 43.0 N
12 Shanghai Jiao Tong University Senior Peng You Female 48.0 NaN
low, high =70, 80
df.query('Weight.between(@low, @high)').head()
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-73-7b9719bb38f0> in <module>
      1 low, high =70, 80
----> 2 df.query('Weight.between(@low, @high)').head()


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in query(self, expr, inplace, **kwargs)
   3343         kwargs["level"] = kwargs.pop("level", 0) + 1
   3344         kwargs["target"] = None
-> 3345         res = self.eval(expr, **kwargs)
   3346 
   3347         try:


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
   3473         kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers)
   3474 
-> 3475         return _eval(expr, inplace=inplace, **kwargs)
   3476 
   3477     def select_dtypes(self, include=None, exclude=None) -> "DataFrame":


D:\Anaconda3\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
    344         eng = _engines[engine]
    345         eng_inst = eng(parsed_expr)
--> 346         ret = eng_inst.evaluate()
    347 
    348         if parsed_expr.assigner is None:


D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in evaluate(self)
     71 
     72         # make sure no names in resolvers and locals/globals clash
---> 73         res = self._evaluate()
     74         return reconstruct_object(
     75             self.result_type, res, self.aligned_axes, self.expr.terms.return_type


D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _evaluate(self)
    111         env = self.expr.env
    112         scope = env.full_scope
--> 113         _check_ne_builtin_clash(self.expr)
    114         return ne.evaluate(s, local_dict=scope)
    115 


D:\Anaconda3\lib\site-packages\pandas\core\computation\engines.py in _check_ne_builtin_clash(expr)
     27         Terms can contain
     28     """
---> 29     names = expr.names
     30     overlap = names & _ne_builtins
     31 


D:\Anaconda3\lib\site-packages\pandas\core\computation\expr.py in names(self)
    812         """
    813         if is_term(self.terms):
--> 814             return frozenset([self.terms.name])
    815         return frozenset(term.name for term in com.flatten(self.terms))
    816 


D:\Anaconda3\lib\site-packages\pandas\core\generic.py in __hash__(self)
   1667     def __hash__(self):
   1668         raise TypeError(
-> 1669             f"{repr(type(self).__name__)} objects are mutable, "
   1670             f"thus they cannot be hashed"
   1671         )


TypeError: 'Series' objects are mutable, thus they cannot be hashed

6. 随机抽样

df_A = pd.DataFrame({'id': list('abcde'),
                          'value': [10, 20, 30, 20, 20]})
df_A
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
id value
0 a 10
1 b 20
2 c 30
3 d 20
4 e 20
df_A.sample(3,replace = True,weights = df_A.value)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
id value
3 d 20
2 c 30
4 e 20

二、多级索引

1. 多级索引及其表的结构

import numpy as np
import pandas as pd
np.random.seed(0)
multi_index = pd.MultiIndex.from_product([list('ABCD'),
               df.Gender.unique()], names=('School', 'Gender'))#重新建表
multi_column = pd.MultiIndex.from_product([['Height', 'Weight'],
                   df.Grade.unique()], names=('Indicator', 'Grade'))
df_multi = pd.DataFrame(np.c_[(np.random.randn(8,4)*5 + 163).tolist(),
                            (np.random.randn(8,4)*5 + 65).tolist()],
                            index = multi_index,
                            columns = multi_column).round(1)
df_multi
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Indicator Height Weight
Grade Freshman Senior Sophomore Junior Freshman Senior Sophomore Junior
School Gender
A Female 171.8 165.0 167.9 174.2 60.6 55.1 63.3 65.8
Male 172.3 158.1 167.8 162.2 71.2 71.0 63.1 63.5
B Female 162.5 165.1 163.7 170.3 59.8 57.9 56.5 74.8
Male 166.8 163.6 165.2 164.7 62.5 62.8 58.7 68.9
C Female 170.5 162.0 164.6 158.7 56.9 63.9 60.5 66.9
Male 150.2 166.3 167.3 159.3 62.4 59.1 64.9 67.1
D Female 174.3 155.7 163.2 162.1 65.3 66.5 61.8 63.2
Male 170.7 170.3 163.8 164.9 61.6 63.2 60.9 56.4
df_multi.index.names
FrozenList(['School', 'Gender'])
df_multi.columns.names
FrozenList(['Indicator', 'Grade'])
df_multi.index.get_level_values(0)#获得某一层的索引
Index(['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], dtype='object', name='School')

2. 多级索引中的loc索引器

熟悉了结构后,现在回到原表,将学校和年级设为索引,此时的行为多级索引,列为单级索引, 由于默认状态的列索引不含名字,因此对应于刚刚图中 Indicator 和 Grade 的索引名位置是空缺的。

df_multi = df.set_index(['School','Grade'])
df_multi.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Name Gender Weight Transfer
School Grade
Shanghai Jiao Tong University Freshman Gaopeng Yang Female 46.0 N
Peking University Freshman Changqiang You Male 70.0 N
Shanghai Jiao Tong University Senior Mei Sun Male 89.0 N
Fudan University Sophomore Xiaojuan Sun Female 41.0 N
Sophomore Gaojuan You Male 74.0 N

由于多级索引中的单个元素以元组为单位,因此之前在第一节介绍的 loc 和 iloc 方法完全可以照搬 ,只需把标量的位置替换成对应的元组,不过在索引前最好对 MultiIndex 进行排序以避免性能警告:

df_multi = df_multi.sort_index()
df_multi.loc[('Fudan University','Junior')].head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Name Gender Weight Transfer
School Grade
Fudan University Junior Yanli You Female 48.0 N
Junior Chunqiang Chu Male 72.0 N
Junior Changfeng Lv Male 76.0 N
Junior Yanjuan Lv Female 49.0 NaN
Junior Gaoqiang Zhou Female 43.0 N
df_multi.loc[[('Fudan University', 'Senior'),
                  ('Shanghai Jiao Tong University', 'Freshman')]].head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Name Gender Weight Transfer
School Grade
Fudan University Senior Chengpeng Zheng Female 38.0 N
Senior Feng Zhou Female 47.0 N
Senior Gaomei Lv Female 34.0 N
Senior Chunli Lv Female 56.0 N
Senior Chengpeng Zhou Male 81.0 N

3. IndexSlice对象

前面介绍的方法,即使在索引不重复的时候,也只能对元组整体进行切片,而不能对每层进行切片, 也不允许将切片和布尔列表混合使用,引入 IndexSlice 对象就能解决这个问题。 Slice 对象一共 有两种形式,第一种为 loc[idx[,]] 型,第二种为 loc[idx[,],idx[,]] 型, 下面将进行介绍。为了方便演示,下面构造一个 索引不重复的 DataFrame :

np.random.seed(0)
L1,L2 = ['A','B','C'],['a','b','c']
mul_index1 = pd.MultiIndex.from_product([L1,L2],names=('Upper', 'Lower'))
L3,L4 = ['D','E','F'],['d','e','f']
mul_index2 = pd.MultiIndex.from_product([L3,L4],names=('Big', 'Small'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(9,9)),index=mul_index1,columns=mul_index2)
df_ex
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big D E F
Small d e f d e f d e f
Upper Lower
A a 3 6 -9 -6 -6 -2 0 9 -5
b -3 3 -8 -3 -2 5 8 -4 4
c -1 0 7 -4 6 6 -9 9 -6
B a 8 5 -2 -9 -8 0 -9 1 -6
b 2 9 -7 -9 -9 -5 -4 -3 -1
c 8 6 -5 0 1 -8 -8 -2 0
C a -6 -3 2 5 9 -9 5 -6 3
b 1 2 -5 -3 -5 6 -6 3 -5
c -1 5 6 -6 6 4 7 8 -4
index = pd.IndexSlice#定义slice对象
df_ex.loc[index['C':,('D','e'):]]#loc[idx[*,*]] 型
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big D E F
Small e f d e f d e f
Upper Lower
C a -3 2 5 9 -9 5 -6 3
b 2 -5 -3 -5 6 -6 3 -5
c 5 6 -6 6 4 7 8 -4
df_ex.loc[index[:'A', lambda x:x.sum()>0]] # 列和大于0
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big D F
Small d e e
Upper Lower
A a 3 6 9
b -3 3 -4
c -1 0 9
df_ex.loc[index[:'A', 'b':], index['E':, 'e':]]#loc[idx[*,*],idx[*,*]] 型前一个 idx 指代的是行索引,后一个是列索引。
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big E F
Small e f e f
Upper Lower
A b -2 5 -4 4
c 6 6 9 -6

4. 多级索引的构造

前面提到了多级索引表的结构和切片,那么除了使用 set_index 之外, 如何自己构造多级索引呢?常用的有 from_tuples, from_arrays, from_product 三种方法, 它们都是 pd.MultiIndex 对象下的函数。

#from_tuples 指根据传入由元组组成的列表进行构造:
my_A = [('a','cat'),('a','dog'),('b','cat'),('b','dog')]
pd.MultiIndex.from_tuples(my_A,names = ['First','Second'])
MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])
#from_arrays 根据传入列表中,对应层的列表进行构造:
my_array = [list('aabb'), ['cat', 'dog']*2]
pd.MultiIndex.from_arrays(my_array, names=['First','Second'])
MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])
#from_product 指根据给定多个列表的笛卡尔积进行构造:
list1 = ['a','b']
list2 = ['cat','dog']
pd.MultiIndex.from_product([list1,list2],names = ['First','Second'])
MultiIndex([('a', 'cat'),
            ('a', 'dog'),
            ('b', 'cat'),
            ('b', 'dog')],
           names=['First', 'Second'])

三、索引的常用方法

1. 索引层的交换和删除

np.random.seed(0)
L1,L2,L3 = ['A','B'],['a','b'],['alpha','beta']
m_index1 = pd.MultiIndex.from_product([L1,L2,L3],names = ('Upper','Lower','Extra'))
L4,L5,L6 = ['C','D'],['c','d'],['cat','dog']
m_index2 = pd.MultiIndex.from_product([L4,L5,L6],names=('Big', 'Small', 'Other'))
df_ex = pd.DataFrame(np.random.randint(-9,10,(8,8)),index=m_index1,columns=m_index2)
df_ex

 
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Small c d c d
Other cat dog cat dog cat dog cat dog
Upper Lower Extra
A a alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
b alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
B a alpha 0 -9 1 -6 2 9 -7 -9
beta -9 -5 -4 -3 -1 8 6 -5
b alpha 0 1 -8 -8 -2 0 -6 -3
beta 2 5 9 -9 5 -6 3 1
df_ex.swaplevel(0,2,axis=1).head() # 列索引的第一层和第三层交换
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Other cat dog cat dog cat dog cat dog
Small c c d d c c d d
Big C C C C D D D D
Upper Lower Extra
A a alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
b alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
B a alpha 0 -9 1 -6 2 9 -7 -9
 df_ex.reorder_levels([2,0,1],axis=0).head() # 列表数字指代原来索引中的层
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Small c d c d
Other cat dog cat dog cat dog cat dog
Extra Upper Lower
alpha A a 3 6 -9 -6 -6 -2 0 9
beta A a -5 -3 3 -8 -3 -2 5 8
alpha A b -4 4 -1 0 7 -4 6 6
beta A b -9 9 -6 8 5 -2 -9 -8
alpha B a 0 -9 1 -6 2 9 -7 -9
df_ex.droplevel(1,axis=1)#删除某一层的索引
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Other cat dog cat dog cat dog cat dog
Upper Lower Extra
A a alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
b alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
B a alpha 0 -9 1 -6 2 9 -7 -9
beta -9 -5 -4 -3 -1 8 6 -5
b alpha 0 1 -8 -8 -2 0 -6 -3
beta 2 5 9 -9 5 -6 3 1
df_ex.droplevel([0,1],axis=0)#删除列索引
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Small c d c d
Other cat dog cat dog cat dog cat dog
Extra
alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
alpha 0 -9 1 -6 2 9 -7 -9
beta -9 -5 -4 -3 -1 8 6 -5
alpha 0 1 -8 -8 -2 0 -6 -3
beta 2 5 9 -9 5 -6 3 1

2.索引属性的修改

df_ex.rename_axis(index = {'Uper':'Changed_row'},columns = {'Other':'Changed_Col'})
.head()#修改索引层名字rename_axis
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Small c d c d
Changed_Col cat dog cat dog cat dog cat dog
Upper Lower Extra
A a alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
b alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
B a alpha 0 -9 1 -6 2 9 -7 -9
df_ex.rename(columns={'cat':'not_cat'},level=2).head()#修改索引值rename
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Small c d c d
Other not_cat dog not_cat dog not_cat dog not_cat dog
Upper Lower Extra
A a alpha 3 6 -9 -6 -6 -2 0 9
beta -5 -3 3 -8 -3 -2 5 8
b alpha -4 4 -1 0 7 -4 6 6
beta -9 9 -6 8 5 -2 -9 -8
B a alpha 0 -9 1 -6 2 9 -7 -9
df_ex.rename(index=lambda x:str.upper(x),level=2).head()#通过函数修改索引值
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead tr th {
    text-align: left;
}

.dataframe thead tr:last-of-type th {
    text-align: right;
}
</style>
Big C D
Small c d c d
Other cat dog cat dog cat dog cat dog
Upper Lower Extra
A a ALPHA 3 6 -9 -6 -6 -2 0 9
BETA -5 -3 3 -8 -3 -2 5 8
b ALPHA -4 4 -1 0 7 -4 6 6
BETA -9 9 -6 8 5 -2 -9 -8
B a ALPHA 0 -9 1 -6 2 9 -7 -9
A = iter(list('abcdefgh'))
df_ex.rename(index=lambda x:next(A),level = 2)
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-80-bc33686b6fdf> in <module>
----> 1 A = iter(list('abcdefgh'))
      2 df_ex.rename(index=lambda x:next(A),level = 2)


TypeError: 'list' object is not callable

3. 索引的设置与重置

df_A = pd.DataFrame({"A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
df_A
---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-86-2fdf41795bad> in <module>
----> 1 df_A = pd.DataFrame({"A":list('aacd'),"B":list('PQRT'),"C":[1,2,3,4]})
      2 df_A


TypeError: 'list' object is not callable

4. 索引的变形

df_reindex = pd.DataFrame({"Weight":[60,70,80], "Height":[176,180,179]}, index=['1001','1003','1002'])
df_reindex
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Weight Height
1001 60 176
1003 70 180
1002 80 179
df_reindex.reindex(index=['1001','1002','1003','1004'],columns=['Weight','Gender'])
#增加一行的同时,去掉一列并新增一列
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Weight Gender
1001 60.0 NaN
1002 80.0 NaN
1003 70.0 NaN
1004 NaN NaN
df_existed = pd.DataFrame(index = ['1001','1002','1003','1004'],columns=['Weight','Gender'])
df_reindex.reindex_like(df_existed)
# reindex_like与reindex相似 ,其功能是仿照传入的表的索引来进行被调用表索引的变形。
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Weight Gender
1001 60.0 NaN
1002 80.0 NaN
1003 70.0 NaN
1004 NaN NaN

四、索引运算

2. 一般的索引运算

df_A = pd.DataFrame([[0,1],[1,2],[3,4]],
                        index = pd.Index(['a','b','a'],name='id1'))
df_B = pd.DataFrame([[4,5],[2,6],[7,1]],index = pd.Index(['b','b','c'],name='id2'))
id1,id2 = df_A.index.unique(),df_B.index.unique()
id1.intersection(id2)
Index(['b'], dtype='object')
id1.union(id2)
Index(['a', 'b', 'c'], dtype='object')
id1.difference(id2)
Index(['a'], dtype='object')
id1.symmetric_difference(id2)
Index(['a', 'c'], dtype='object')
#若两张表需要做集合运算的列并没有被设置索引,一种办法是先转成索引,运算后再恢复,
#另一种方法是利用 isin 函数
df_a = df_A.reset_index()
df_b = df_B.reset_index()
df_a
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
id1 0 1
0 a 0 1
1 b 1 2
2 a 3 4
df_b
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
id2 0 1
0 b 4 5
1 b 2 6
2 c 7 1
df_a[df_a.id1.isin(df_b.id2)]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
id1 0 1
1 b 1 2

五、练习

Ex1:公司员工数据集

df = pd.read_csv(r'C:\Users\zhoukaiwei\Desktop\joyful-pandas\data/company.csv')
df.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
EmployeeID birthdate_key age city_name department job_title gender
0 1318 1/3/1954 61 Vancouver Executive CEO M
1 1319 1/3/1957 58 Vancouver Executive VP Stores F
2 1320 1/2/1955 60 Vancouver Executive Legal Counsel F
3 1321 1/2/1959 56 Vancouver Executive VP Human Resources M
4 1322 1/9/1958 57 Vancouver Executive VP Finance M

分别只使用 query 和 loc 选出年龄不超过四十岁且工作部门为 Dairy 或 Bakery 的男性。

df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()
  File "<ipython-input-108-aa2a95f52718>", line 1
    df.query('(age <= 40)&(department is in ['Diary','Bakery'])&(gender=='M')').head()
                                                  ^
SyntaxError: invalid syntax

选出员工 ID 号 为奇数所在行的第1、第3和倒数第2列。

df.iloc[(df.EmployeeID%2==1).values,[0,2,-2]].head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
EmployeeID age job_title
1 1319 58 VP Stores
3 1321 56 VP Human Resources
5 1323 53 Exec Assistant, VP Stores
6 1325 51 Exec Assistant, Legal Counsel
8 1329 48 Store Manager

按照以下步骤进行索引操作:

把后三列设为索引后交换内外两层

df.set_index(df.columns[-3:].tolist()).swaplevel(0,2,axis=0).head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
EmployeeID birthdate_key age city_name
gender job_title department
M CEO Executive 1318 1/3/1954 61 Vancouver
F VP Stores Executive 1319 1/3/1957 58 Vancouver
Legal Counsel Executive 1320 1/2/1955 60 Vancouver
M VP Human Resources Executive 1321 1/2/1959 56 Vancouver
VP Finance Executive 1322 1/9/1958 57 Vancouver
df.reset_index(level=1)#恢复中间一层
---------------------------------------------------------------------------

IndexError                                Traceback (most recent call last)

<ipython-input-116-bd39f5c7cad0> in <module>
----> 1 df.reset_index(level=1)


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in reset_index(self, level, drop, inplace, col_level, col_fill)
   4818             if not isinstance(level, (tuple, list)):
   4819                 level = [level]
-> 4820             level = [self.index._get_level_number(lev) for lev in level]
   4821             if len(level) < self.index.nlevels:
   4822                 new_index = self.index.droplevel(level)


D:\Anaconda3\lib\site-packages\pandas\core\frame.py in <listcomp>(.0)
   4818             if not isinstance(level, (tuple, list)):
   4819                 level = [level]
-> 4820             level = [self.index._get_level_number(lev) for lev in level]
   4821             if len(level) < self.index.nlevels:
   4822                 new_index = self.index.droplevel(level)


D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _get_level_number(self, level)
   1413 
   1414     def _get_level_number(self, level) -> int:
-> 1415         self._validate_index_level(level)
   1416         return 0
   1417 


D:\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _validate_index_level(self, level)
   1405             elif level > 0:
   1406                 raise IndexError(
-> 1407                     f"Too many levels: Index has only 1 level, not {level + 1}"
   1408                 )
   1409         elif level != self.name:


IndexError: Too many levels: Index has only 1 level, not 2

修改外层索引名为 Gender

df.rename_axis(index={'gender':'Gender'}).head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
EmployeeID birthdate_key age city_name department job_title gender
0 1318 1/3/1954 61 Vancouver Executive CEO M
1 1319 1/3/1957 58 Vancouver Executive VP Stores F
2 1320 1/2/1955 60 Vancouver Executive Legal Counsel F
3 1321 1/2/1959 56 Vancouver Executive VP Human Resources M
4 1322 1/9/1958 57 Vancouver Executive VP Finance M

用下划线合并两层行索引

后面题目继续再做后面会更新到文档里面