numpy & pandas

2019-07-21 · 约 5363 字 · 预计阅读 11 分钟 · 收录于机器学习

numpy

查看矩阵属性

import numpy as np

array = np.array([
    [1,2,3],
    [4,5,6]
])
#打印矩阵
print(array)
#矩阵维度: ndim 行数与列数: shape 元素个数: size 元素类型: dtype
print('维度:',array.ndim,'形状:',array.shape,'大小:',array.size,'类型:', array.dtype)

##########输出结果###########
[[1 2 3]
 [4 5 6]]
维度: 2 形状: (2, 3) 大小: 6 类型: int32

创建矩阵

import numpy as np

a = np.array([
    [1,2,3],
    [4,5,6]
],dtype=np.int64)
print(a,a.dtype)
#全为0
b = np.zeros((2,3))
print(b)
#全为1
c = np.ones((2,3))
print(c)
#空矩阵
d = np.empty((2,3))
print(d)
print(b.dtype, c.dtype, d.dtype)

#生成左闭右开序列，第三位为步进值
e = np.arange(10,20,2)
print(e)

#reshape 重新定义矩阵的行和列
f = np.arange(12).reshape((3,4))
print(f)
#linspace
g = np.linspace(1,10,5) #在[1,10]范围内等间距取5个数（包含两个端点）
print(g)
##############输出结果#############
[[1 2 3]
 [4 5 6]] int64
[[0. 0. 0.]
 [0. 0. 0.]]
[[1. 1. 1.]
 [1. 1. 1.]]
[[1. 1. 1.]
 [1. 1. 1.]]
float64 float64 float64
[10 12 14 16 18]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[ 1.    3.25  5.5   7.75 10.  ]

基础运算

import numpy as np

a = np.array([10,20,30,40])
b = np.arange(4)
print(a,b)
#加减法
c = a-b
print(c)
#次幂
d = b**3   #三次幂
print(d)
#三角函数
e = np.sin(a)  #对a中的每一个元素求sin值
print(e)
#判断矩阵中的元素是否小于某一特定值
print(b<=2)  #判断b中的元素是否小于等于2

a = np.array([
    [1,1],
    [0,1]
])
b = np.arange(4).reshape((2,2))
print(a)
print(b)
#矩阵乘法
c = np.dot(a,b)
#或 c = a.dot(b)
print(c)
#普通的对应相乘
d = a*b
print(d)
#转置
print(a.T)

#生成0-1之间的随机数
a = np.random.random((2,4))
#sum 求和 min 最小值 max 最大值
print(a,np.sum(a),np.min(a),np.max(a))

#axis=0 沿纵向（跨行）运算，即对每一列分别计算
#axis=1 沿横向（跨列）运算，即对每一行分别计算
print(a)
print(np.sum(a,axis=1)) #按行求和
print(np.min(a,axis=0)) #每一列的最小值
print(np.max(a,axis=1)) #每一行的最大值

#最大值索引，最小值索引
a = np.arange(2,14).reshape(3,4)
print(a)
print(np.argmax(a))
print(np.argmin(a))
#平均值
print(np.mean(a)) #或print(a.mean())
#中位数
print(np.median(a))
#累加值cumsum
print(np.cumsum(a)) #[第一个 第一个+第二个 第一个+第二个+第三个 ...]
#后一个与前一个相差，结果列数会减一
print(np.diff(a))
#逐行排序，默认从小到大
print(np.sort(a))
#clip
print(np.clip(a,5,10)) #将矩阵中小于5的数变为5，大于10的数变为10
#######输出结果#########
[10 20 30 40] [0 1 2 3]
[10 19 28 37]
[ 0  1  8 27]
[-0.54402111  0.91294525 -0.98803162  0.74511316]
[ True  True  True False]  #判断矩阵中的元素是否小于某一特定值
[[1 1]
 [0 1]]
[[0 1]
 [2 3]]
#矩阵乘法
[[2 4]
 [2 3]]
#普通的对应相乘
[[0 1]
 [0 3]]
#转置
[[1 0]
 [1 1]]
[[0.77735274 0.5491183  0.22564502 0.68081005]
 [0.96350842 0.579199   0.32895422 0.60034064]] 4.704928386200018 0.2256450223259967 0.9635084205378365
[[0.77735274 0.5491183  0.22564502 0.68081005]
 [0.96350842 0.579199   0.32895422 0.60034064]]
[2.23292612 2.47200227]
[0.77735274 0.5491183  0.22564502 0.60034064]
[0.77735274 0.96350842]
[[ 2  3  4  5]
 [ 6  7  8  9]
 [10 11 12 13]]
11  #最大值索引，最小值索引
0
7.5
7.5
[ 2  5  9 14 20 27 35 44 54 65 77 90]
[[1 1 1]
 [1 1 1]
 [1 1 1]]
#逐行排序，默认从小到大
[[ 2  3  4  5]  
 [ 6  7  8  9]
 [10 11 12 13]]
#clip
[[ 5  5  5  5]
 [ 6  7  8  9]
 [10 10 10 10]]

索引

import numpy as np

A = np.arange(3,15)
#一维数组索引
print(A[2])
A = A.reshape(3,4)
#二维数组索引
print(A)
print(A[2])
print(A[2][2])

#切片
print(A)
print(A[1,1])
print(A[1,:])
print(A[1,1:3]) #1:3为左闭右开区间

#迭代
for row in A:     #默认迭代行
    print(row)
#迭代列
for colume in A.T:  #先转置，再迭代行，就得到原始矩阵的列了
    print(colume)
#迭代每一项
for item in A.flat:  #A.flat将A变为一维迭代
    print(item)
#A.flatten()，将二维矩阵变为一维矩阵，与A.flat不同的是，A.flat返回值为迭代器，而A.flatten()返回值为一维矩阵
print(A.flatten())
###########输出结果#############
5
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
[11 12 13 14]
13
#切片
[[ 3  4  5  6]
 [ 7  8  9 10]
 [11 12 13 14]]
8
[ 7  8  9 10]
[8 9]

[3 4 5 6]
[ 7  8  9 10]
[11 12 13 14]
#迭代列
[ 3  7 11]
[ 4  8 12]
[ 5  9 13]
[ 6 10 14]
#迭代每一项
3
4
5
6
7
8
9
10
11
12
13
14
[ 3  4  5  6  7  8  9 10 11 12 13 14]

array合并

import numpy as np

A = np.array([
    [1,1,1],
    [1,1,1]
])
B = np.array([
    [2,2,2],
    [2,2,2]
])

#垂直(上下)合并,可同时合并多个
print(np.vstack((A,B,A)))
#水平(左右)合并
print(np.hstack((A,B,A)))

#多个矩阵纵向或横向合并concatenate
print(np.concatenate((A,B,A,B),axis=0))  #上下合并
###########输出结果##########
[[1 1 1]
 [1 1 1]
 [2 2 2]
 [2 2 2]
 [1 1 1]
 [1 1 1]]

[[1 1 1 2 2 2 1 1 1]
 [1 1 1 2 2 2 1 1 1]]

[[1 1 1]
 [1 1 1]
 [2 2 2]
 [2 2 2]
 [1 1 1]
 [1 1 1]
 [2 2 2]
 [2 2 2]]

array分割

import numpy as np

A = np.arange(12).reshape(3,4)
print(A)

#等比例分割
#axis=0 沿行方向切分（按行分成上下几块）
#axis=1 沿列方向切分（按列分成左右几块）
print(np.split(A,2,axis=1)) #横向分割为两块
print(np.split(A,3,axis=0)) #纵向分割为三块

print(np.vsplit(A,3)) #纵向分割为三块
print(np.hsplit(A,2)) #横向分割为两块

#不等分分割：例如4列要横向分成3块时，无法用split等分，只能用array_split不均等地分为三块
print(np.array_split(A,3,axis=1)) #横向分割为三块
#########输出结果############
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
#横向分割为两块
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
#纵向分割为三块
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
#纵向分割为三块
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8,  9, 10, 11]])]
#横向分割为两块
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
#横向分割为三块，不等比
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]

copy & deep copy

import numpy as np

a = np.arange(4)
print(a)
#copy 这种赋值方式相当于C++中的引用，相当于起了个别名
b = a
c = b

a[0]  = 11
print(a)
print(c)
print(b is a) #判断b与a是否相同
print(b == a) #判断b与a中的每个元素是否对应相等

# deep copy
a = np.arange(4)
b = a.copy()
a[0] = 11
print(a)
print(b)
#########输出结果##########
[0 1 2 3]
[11  1  2  3]
[11  1  2  3]
#判断b与a是否相同
True
#判断b与a中的每个元素是否对应相等
[ True  True  True  True]
[11  1  2  3]
[0 1 2 3]

pandas

pandas中的数据格式为DataFrame，主要以“行”作为一个单位，规模大多是向下增长的；一行的数据为一个Series

矩阵的基本操作

import pandas as pd
import numpy as np

s = pd.Series([1,2,3,np.nan,44,1])
print(s)
dates = pd.date_range('20190101',periods=6)
print(dates)
#设置行索引为index，列索引为columns，若不设置的话，默认为0 1 2 ...
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)

#生成DataFrame主要有两种方式，一种是利用Numpy来导入，另一种是用字典方式来构造，每一行代表矩阵的一列，如以下
df2 = pd.DataFrame({
    'A':1,
    'B':pd.Timestamp('20190101'),
    'C':pd.Series(1,index=list(range(4)),dtype='float32'),
    'D':np.array([3]*4,dtype='int32'),
    'E':pd.Categorical(["test","train","test","train"]),
    'F':'foo'
})
print(df2)

#查看数据格式
print(df2.dtypes)
#查看行索引
print(df2.index)
#查看列索引
print(df2.columns)
#查看数据值
print(df2.values)

#描述 查看数值型数据的平均值、最大/最小值等属性
print(df2.describe())
#按索引排序
df3 = df2.sort_index(axis=1,ascending=False)  #将列索引倒序显示
print(df3)
#按值排序
df4 = df2.sort_values(by='E')
print(df4)
#######输出结果##########
0     1.0
1     2.0
2     3.0
3     NaN
4    44.0
5     1.0
dtype: float64

DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06'],
              dtype='datetime64[ns]', freq='D')
#利用numpy来构造
                   a         b         c         d
2019-01-01  0.920367 -0.561371  1.249657 -0.091844
2019-01-02  1.888219 -0.367496  1.048375  0.967817
2019-01-03 -0.557569  0.312368 -0.060077 -0.568380
2019-01-04  0.088714 -0.922488 -0.987815  0.355981
2019-01-05 -0.375266 -0.819393  0.570262 -0.346604
2019-01-06  0.665096  0.097616  0.894537  0.326664
#用字典方式来构造
   A          B    C  D      E    F
0  1 2019-01-01  1.0  3   test  foo
1  1 2019-01-01  1.0  3  train  foo
2  1 2019-01-01  1.0  3   test  foo
3  1 2019-01-01  1.0  3  train  foo
A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
#查看行索引
Int64Index([0, 1, 2, 3], dtype='int64')
#查看列索引
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

[[1 Timestamp('2019-01-01 00:00:00') 1.0 3 'test' 'foo']
 [1 Timestamp('2019-01-01 00:00:00') 1.0 3 'train' 'foo']
 [1 Timestamp('2019-01-01 00:00:00') 1.0 3 'test' 'foo']
 [1 Timestamp('2019-01-01 00:00:00') 1.0 3 'train' 'foo']]
#描述 查看数值型数据的平均值、最大/最小值等属性
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
#按索引排序
     F      E  D    C          B  A
0  foo   test  3  1.0 2019-01-01  1
1  foo  train  3  1.0 2019-01-01  1
2  foo   test  3  1.0 2019-01-01  1
3  foo  train  3  1.0 2019-01-01  1
#按值排序
   A          B    C  D      E    F
0  1 2019-01-01  1.0  3   test  foo
2  1 2019-01-01  1.0  3   test  foo
1  1 2019-01-01  1.0  3  train  foo
3  1 2019-01-01  1.0  3  train  foo

pandas选择器

import numpy as np
import pandas as pd

dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)

#通过索引来选择
print(df['A'],'\n或者:',df.A) #此时不能用df[0]了，因为其索引已经改变了，从原来默认的0 1 2变为了A B C ...
#通过切片来选择
print(df[1:3])  #[1,3)代表二三行
print(df['20190102':'20190104'])

#select by label:loc (location) 通过标签索引来选
print(df.loc['20190102'])
#综上，选择列可以用索引来选如df['A'],而选择行则用loc来选，如df.loc['20190102'],而用df['20190102']则是错的
print(df.loc[:,['A','B']]) #打印出A,B两列

#select by position:iloc (index location) 通过行列坐标来选
print(df.iloc[3,1])  #输出第4行第2列
print(df.iloc[1:3,2:3]) #输出第2和第3行的第3列

#Boolean indexing #判断选择
print([df[df.A > 8]]) #将'A'列中大于8的行打印出来
#######结果输出########
             A   B   C   D
2019-01-01   0   1   2   3
2019-01-02   4   5   6   7
2019-01-03   8   9  10  11
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23
2019-01-01     0
2019-01-02     4
2019-01-03     8
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: A, dtype: int32 
或者: 2019-01-01     0
2019-01-02     4
2019-01-03     8
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: A, dtype: int32
            A  B   C   D
2019-01-02  4  5   6   7
2019-01-03  8  9  10  11
             A   B   C   D
2019-01-02   4   5   6   7
2019-01-03   8   9  10  11
2019-01-04  12  13  14  15
#select by label:loc (location) 通过标签索引来选
A    4
B    5
C    6
D    7
Name: 2019-01-02 00:00:00, dtype: int32
#print(df.loc[:,['A','B']]) 打印出A,B两列
             A   B
2019-01-01   0   1
2019-01-02   4   5
2019-01-03   8   9
2019-01-04  12  13
2019-01-05  16  17
2019-01-06  20  21
#通过行列坐标来选
13
#
             C
2019-01-02   6
2019-01-03  10
#判断选择
[             A   B   C   D
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23]

pandas 设置值

import numpy as np
import pandas as pd

# Demo: the different ways of assigning values into a DataFrame.
# Build a 6x4 integer frame indexed by six consecutive dates starting 2019-01-01.
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)

# Assign by integer position: the cell in the 3rd row / 3rd column becomes 223.
df.iloc[2,2] = 223
print(df)
# Assign by label: row '20190102', column 'B'.
df.loc['20190102','B'] = 567
print(df)
# Assign by condition. A single .loc call (row mask + column label) writes into
# df itself. The chained form `df.A[df.A>=8] = 0` assigns through an
# intermediate Series, which raises SettingWithCopyWarning and leaves df
# unchanged when pandas copy-on-write is enabled.
df.loc[df.A>=8,'A'] = 0
print(df)
# Add a new column; its length must match the number of rows.
df['F'] = [1,2,3,4,5,6]  # or np.nan
print(df)

######输出结果########
             A   B   C   D
2019-01-01   0   1   2   3
2019-01-02   4   5   6   7
2019-01-03   8   9  10  11
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23
#通过坐标选择后赋值修改
             A   B    C   D
2019-01-01   0   1    2   3
2019-01-02   4   5    6   7
2019-01-03   8   9  223  11
2019-01-04  12  13   14  15
2019-01-05  16  17   18  19
2019-01-06  20  21   22  23
#通过索引标签
             A    B    C   D
2019-01-01   0    1    2   3
2019-01-02   4  567    6   7
2019-01-03   8    9  223  11
2019-01-04  12   13   14  15
2019-01-05  16   17   18  19
2019-01-06  20   21   22  23
#通过判断条件
            A    B    C   D
2019-01-01  0    1    2   3
2019-01-02  4  567    6   7
2019-01-03  0    9  223  11
2019-01-04  0   13   14  15
2019-01-05  0   17   18  19
2019-01-06  0   21   22  23
#添加新的一列
            A    B    C   D  F
2019-01-01  0    1    2   3  1
2019-01-02  4  567    6   7  2
2019-01-03  0    9  223  11  3
2019-01-04  0   13   14  15  4
2019-01-05  0   17   18  19  5
2019-01-06  0   21   22  23  6

pandas处理丢失数据

import numpy as np
import pandas as pd

# Demo: detecting, dropping and filling missing values (NaN) in a DataFrame.
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)

# NaN is a float, so make the two target columns float explicitly before
# writing NaN into them instead of relying on implicit int -> float upcasting
# during assignment (deprecated in recent pandas releases).
df = df.astype({'B': float, 'C': float})
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)

# dropna: axis=0 drops rows; how='any' drops a row containing at least one NaN,
# how='all' drops a row only when every value in it is NaN.
print(df.dropna(axis=0,how='any'))

# dropna returned a new frame; df itself still contains the NaN cells.
print(df)
# fillna: replace every NaN with the given value.
print(df.fillna(value=0))

# isna: element-wise True/False table marking the NaN cells.
print(df.isna())
# Collapse the mask to a single flag: does the frame contain any NaN at all?
# Reducing the underlying ndarray avoids depending on how np.any dispatches
# to DataFrame.any.
has_nan = df.isna().values.any()
print(has_nan)

#######输出结果##########
             A   B   C   D
2019-01-01   0   1   2   3
2019-01-02   4   5   6   7
2019-01-03   8   9  10  11
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23
             A     B     C   D
2019-01-01   0   NaN   2.0   3
2019-01-02   4   5.0   NaN   7
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
#丢弃 dropna
             A     B     C   D
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
             A     B     C   D
2019-01-01   0   NaN   2.0   3
2019-01-02   4   5.0   NaN   7
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
#填充 fillna
             A     B     C   D
2019-01-01   0   0.0   2.0   3
2019-01-02   4   5.0   0.0   7
2019-01-03   8   9.0  10.0  11
2019-01-04  12  13.0  14.0  15
2019-01-05  16  17.0  18.0  19
2019-01-06  20  21.0  22.0  23
#判断是否是nan
                A      B      C      D
2019-01-01  False   True  False  False
2019-01-02  False  False   True  False
2019-01-03  False  False  False  False
2019-01-04  False  False  False  False
2019-01-05  False  False  False  False
2019-01-06  False  False  False  False
#
True

pandas导入导出数据

import pandas as pd
#读取
data = pd.read_csv('C:\\Users\\17806\\OneDrive\\桌面\\Study\\MyProgram\\student.csv')
print(data)
#存储
data.to_pickle('C:\\Users\\17806\\OneDrive\\桌面\\Study\\MyProgram\\student.pickle')
#####输出结果#######
    Student ID  name   age  gender
0         1100  Kelly   22  Female
1         1101    Clo   21  Female
2         1102  Tilly   22  Female
3         1103   Tony   24    Male
4         1104  David   20    Male
5         1105  Catty   22  Female
6         1106      M    3  Female

pandas连接DataFrame concatenating

import pandas as pd
import numpy as np

# Demo: stacking DataFrames with pd.concat.
# Three 3x4 frames filled with 0, 1 and 2 that share the same columns.
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)

# axis=0 stacks the frames vertically; each keeps its own row labels (0,1,2).
res = pd.concat([df1,df2,df3],axis=0)
print(res)
# ignore_index=True discards the existing row labels and renumbers from 0.
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)

# Two frames that overlap only partially, both in columns and in row labels.
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','f','e'],index=[2,3,4])
print(df1)
print(df2)

# join is 'outer' (the default) or 'inner'.
# outer: keep the union of columns; cells a frame does not have become NaN.
res = pd.concat([df1,df2],join='outer')
print(res)
# inner: keep only the columns present in every frame.
res = pd.concat([df1,df2],join='inner')
print(res)

# axis=1 places the frames side by side, aligned on row labels.
res = pd.concat([df1,df2],axis=1)
print(res)
# reindex(df1.index) keeps only the rows labelled by df1's index.
res = pd.concat([df1,df2],axis=1).reindex(df1.index)
print(res)

# Appending rows. DataFrame.append was removed in pandas 2.0, so the same
# results are built with pd.concat.
res = pd.concat([df1,df2],ignore_index=True)
print(res)
res = pd.concat([df1,df2,df3],ignore_index=True)
print(res)
# Append a single row given as a Series: turn it into a one-row frame first
# (to_frame() makes it a column, .T transposes it into a row).
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
res = pd.concat([df1,s1.to_frame().T],ignore_index=True)
print(res)
#########输出结果#######
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
#concatenating
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
#ignore_index=True
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
     b    c    f    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
#列索引相同的归为一列，不同的则为nan
     a    b    c    d    e    f
1  0.0  0.0  0.0  0.0  NaN  NaN
2  0.0  0.0  0.0  0.0  NaN  NaN
3  0.0  0.0  0.0  0.0  NaN  NaN
2  NaN  1.0  1.0  NaN  1.0  1.0
3  NaN  1.0  1.0  NaN  1.0  1.0
4  NaN  1.0  1.0  NaN  1.0  1.0
#列索引相同的归为一列，不同的则剔除
     b    c
1  0.0  0.0
2  0.0  0.0
3  0.0  0.0
2  1.0  1.0
3  1.0  1.0
4  1.0  1.0
     a    b    c    d    b    c    f    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
#将横向索引按照df1重新排序
     a    b    c    d    b    c    f    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
#append  向下追加数据，以DataFrame为单元
     a    b    c    d    e    f
0  0.0  0.0  0.0  0.0  NaN  NaN
1  0.0  0.0  0.0  0.0  NaN  NaN
2  0.0  0.0  0.0  0.0  NaN  NaN
3  NaN  1.0  1.0  NaN  1.0  1.0
4  NaN  1.0  1.0  NaN  1.0  1.0
5  NaN  1.0  1.0  NaN  1.0  1.0

     a    b    c    d    e    f
0  0.0  0.0  0.0  0.0  NaN  NaN
1  0.0  0.0  0.0  0.0  NaN  NaN
2  0.0  0.0  0.0  0.0  NaN  NaN
3  NaN  1.0  1.0  NaN  1.0  1.0
4  NaN  1.0  1.0  NaN  1.0  1.0
5  NaN  1.0  1.0  NaN  1.0  1.0
6  2.0  2.0  2.0  2.0  NaN  NaN
7  2.0  2.0  2.0  2.0  NaN  NaN
8  2.0  2.0  2.0  2.0  NaN  NaN
#追加一行数据 Series
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0

pandas合并DataFrame merge

import pandas as pd
import numpy as np

left = pd.DataFrame({
    'key':['K0','K1','K2','K3'],
    'A':['A0','A1','A2','A3'],
    'B':['B0','B1','B2','B3']
})
right = pd.DataFrame({
    'key':['K0','K1','K2','K3'],
    'C':['C0','C1','C2','C3'],
    'D':['D0','D1','D2','D3']
})
print(left)
print(right)

#merge how默认为inner，即没有的就剔除
res = pd.merge(left,right,on='key')  #on表示按照哪一个列索引来合并
print(res)

left = pd.DataFrame({
    'key1':['K0','K0','K1','K2'],
    'key2':['K0','K1','K0','K1'],
    'A':['A0','A1','A2','A3'],
    'B':['B0','B1','B2','B3']
})
right = pd.DataFrame({
    'key1':['K0','K1','K1','K2'],
    'key2':['K0','K0','K0','K0'],
    'C':['C0','C1','C2','C3'],
    'D':['D0','D1','D2','D3']
})
print(left)
print(right)
#按照两个key索引来合并
res = pd.merge(left,right,on=['key1','key2'])  
print(res)
########输出结果#########
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
#merge 
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
#按照两个key索引来合并
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2

pandas画图 plot

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Series类数据
data1 = pd.Series(np.random.randn(1000),index=np.arange(1000))
data1 = data1.cumsum()
print(data1)
data1.plot()

#DataFrame类数据
data2 = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
data2 = data2.cumsum()
data2.plot()
print(data2)

#图形显示方式：bar hist box kde area scatter hexbin pie
#散点图 scatter
data3 = data2.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1')
data4 = data2.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2', ax=data3)

plt.show()
######输出结果#########
0      -0.931262
1      -0.459424
2      -0.164038
3      -0.196871
4      -0.902499
         ...
995    -7.521910
996    -7.621425
997   -10.207572
998   -10.859286
999   -11.072143
Length: 1000, dtype: float64

             A          B          C         D
0    -0.944536   1.715489  -0.197838  0.104654
1    -0.334956   1.261398  -1.197463 -0.645474
2     1.009486   0.777278  -1.625246 -0.436689
3     0.689034   1.317292  -1.726993 -0.038378
4     1.096938   4.138512  -2.609029 -0.158040
..         ...        ...        ...       ...
995 -17.457821  29.327120 -36.986072  2.700634
996 -16.550651  29.218301 -35.862315  4.810438
997 -17.055353  29.476536 -35.520861  4.162424
998 -16.429215  29.605428 -36.791283  1.169698
999 -17.281577  29.707916 -35.212506  1.860799
[1000 rows x 4 columns]

Series类数据
DataFrame类数据
散点图 scatter

目录

numpy & pandas

numpy

查看矩阵属性

创建矩阵

基础运算

索引

array合并

array分割

copy & deep copy

pandas

矩阵的基本操作

pandas选择器

pandas 设置值

pandas处理丢失数据

pandas导入导出数据

pandas连接DataFrame concatenating

pandas合并DataFrame merge

pandas画图 plot