numpy & pandas
目录
numpy
查看矩阵属性
import numpy as np
# Build a 2x3 integer matrix from a nested list.
array = np.array([
[1,2,3],
[4,5,6]
])
# Print the matrix itself
print(array)
# ndim: number of dimensions, shape: (rows, columns), size: total element count, dtype: element type
print('维度:',array.ndim,'形状:',array.shape,'大小:',array.size,'类型:', array.dtype)
##########输出结果###########
[[1 2 3]
[4 5 6]]
维度: 2 形状: (2, 3) 大小: 6 类型: int32
创建矩阵
import numpy as np
# Create an array with an explicitly requested element type.
a = np.array([
[1,2,3],
[4,5,6]
],dtype=np.int64)
print(a,a.dtype)
# All zeros
b = np.zeros((2,3))
print(b)
# All ones
c = np.ones((2,3))
print(c)
# "Empty" array: np.empty allocates without initialising, so the printed values are arbitrary
d = np.empty((2,3))
print(d)
print(b.dtype, c.dtype, d.dtype)
# Half-open range [10, 20); the third argument is the step
e = np.arange(10,20,2)
print(e)
# reshape: rearrange the same elements into the given (rows, columns)
f = np.arange(12).reshape((3,4))
print(f)
# linspace
g = np.linspace(1,10,5) # 5 evenly spaced values from 1 to 10, both endpoints included
print(g)
##############输出结果#############
[[1 2 3]
[4 5 6]] int64
[[0. 0. 0.]
[0. 0. 0.]]
[[1. 1. 1.]
[1. 1. 1.]]
[[1. 1. 1.]
[1. 1. 1.]]
float64 float64 float64
[10 12 14 16 18]
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
[ 1. 3.25 5.5 7.75 10. ]
基础运算
import numpy as np
# Walk through element-wise arithmetic, matrix products, reductions and a few helpers.
a = np.array([10,20,30,40])
b = np.arange(4)
print(a,b)
# Element-wise addition / subtraction
c = a-b
print(c)
# Element-wise power
d = b**3 # cube of each element
print(d)
# Trigonometric functions
e = np.sin(a) # sine of each element of a
print(e)
# Element-wise comparison against a threshold
print(b<=2) # boolean array: is each element of b <= 2?
a = np.array([
[1,1],
[0,1]
])
b = np.arange(4).reshape((2,2))
print(a)
print(b)
# Matrix multiplication
c = np.dot(a,b)
# equivalently: c = a.dot(b)
print(c)
# Plain element-wise multiplication
d = a*b
print(d)
# Transpose
print(a.T)
# Random numbers in [0, 1)
a = np.random.random((2,4))
# sum: total of all elements, min: smallest element, max: largest element
print(a,np.sum(a),np.min(a),np.max(a))
# axis=0 reduces down the rows, giving one result per column
# axis=1 reduces across the columns, giving one result per row
print(a)
print(np.sum(a,axis=1)) # sum of each row
print(np.min(a,axis=0)) # minimum of each column
print(np.max(a,axis=1)) # maximum of each row
# Index of the largest / smallest element (position in the flattened array)
a = np.arange(2,14).reshape(3,4)
print(a)
print(np.argmax(a))
print(np.argmin(a))
# Mean
print(np.mean(a)) # equivalently: print(a.mean())
# Median
print(np.median(a))
# Cumulative sum (cumsum) over the flattened array
print(np.cumsum(a)) # [x0, x0+x1, x0+x1+x2, ...]
# Difference between each element and the previous one in the same row; the result has one fewer column
print(np.diff(a))
# Sort each row, ascending by default
print(np.sort(a))
# clip
print(np.clip(a,5,10)) # values below 5 become 5, values above 10 become 10
#######输出结果#########
[10 20 30 40] [0 1 2 3]
[10 19 28 37]
[ 0 1 8 27]
[-0.54402111 0.91294525 -0.98803162 0.74511316]
[ True True True False] #判断矩阵中的元素是否小于某一特定值
[[1 1]
[0 1]]
[[0 1]
[2 3]]
#矩阵乘法
[[2 4]
[2 3]]
#普通的对应相乘
[[0 1]
[0 3]]
#转置
[[1 0]
[1 1]]
[[0.77735274 0.5491183 0.22564502 0.68081005]
[0.96350842 0.579199 0.32895422 0.60034064]] 4.704928386200018 0.2256450223259967 0.9635084205378365
[[0.77735274 0.5491183 0.22564502 0.68081005]
[0.96350842 0.579199 0.32895422 0.60034064]]
[2.23292612 2.47200227]
[0.77735274 0.5491183 0.22564502 0.60034064]
[0.77735274 0.96350842]
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
11 #最大值索引,最小值索引
0
7.5
7.5
[ 2 5 9 14 20 27 35 44 54 65 77 90]
[[1 1 1]
[1 1 1]
[1 1 1]]
#逐行排序,默认从小到大
[[ 2 3 4 5]
[ 6 7 8 9]
[10 11 12 13]]
#clip
[[ 5 5 5 5]
[ 6 7 8 9]
[10 10 10 10]]
索引
import numpy as np
# Indexing, slicing and iterating over a NumPy array.
A = np.arange(3,15)
# Indexing a 1-D array
print(A[2])
A = A.reshape(3,4)
# Indexing a 2-D array: A[2] is the third row, A[2][2] the third element of that row
print(A)
print(A[2])
print(A[2][2])
# Slicing
print(A)
print(A[1,1])
print(A[1,:])
print(A[1,1:3]) # 1:3 is the half-open interval [1, 3)
# Iteration
for row in A: # iterating a 2-D array yields its rows
    print(row)
# Iterating over columns
for column in A.T: # transpose first; the rows of A.T are the columns of A
    print(column)
# Iterating over every element
for item in A.flat: # A.flat is a 1-D iterator over all elements of A
    print(item)
# A.flatten() also gives a 1-D view of the data, but unlike A.flat (an iterator) it returns a new 1-D array
print(A.flatten())
###########输出结果#############
5
[[ 3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]]
[11 12 13 14]
13
#切片
[[ 3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]]
8
[ 7 8 9 10]
[8 9]
[3 4 5 6]
[ 7 8 9 10]
[11 12 13 14]
#迭代列
[ 3 7 11]
[ 4 8 12]
[ 5 9 13]
[ 6 10 14]
#迭代每一项
3
4
5
6
7
8
9
10
11
12
13
14
[ 3 4 5 6 7 8 9 10 11 12 13 14]
array合并
import numpy as np
# Two 2x3 arrays with distinct values so the stacking order is visible in the output.
A = np.array([
[1,1,1],
[1,1,1]
])
B = np.array([
[2,2,2],
[2,2,2]
])
# Vertical (top-to-bottom) stacking; any number of arrays may be passed
print(np.vstack((A,B,A)))
# Horizontal (left-to-right) stacking
print(np.hstack((A,B,A)))
# concatenate joins several arrays along the chosen axis
print(np.concatenate((A,B,A,B),axis=0)) # axis=0: stack top-to-bottom
###########输出结果##########
[[1 1 1]
[1 1 1]
[2 2 2]
[2 2 2]
[1 1 1]
[1 1 1]]
[[1 1 1 2 2 2 1 1 1]
[1 1 1 2 2 2 1 1 1]]
[[1 1 1]
[1 1 1]
[2 2 2]
[2 2 2]
[1 1 1]
[1 1 1]
[2 2 2]
[2 2 2]]
array分割
import numpy as np
# Split a 3x4 array into equal and unequal pieces.
A = np.arange(12).reshape(3,4)
print(A)
# Equal-sized splits
# axis=0 splits the rows into groups (pieces stacked top to bottom)
# axis=1 splits the columns into groups (pieces side by side)
print(np.split(A,2,axis=1)) # two pieces side by side (2 columns each)
print(np.split(A,3,axis=0)) # three pieces top to bottom (1 row each)
print(np.vsplit(A,3)) # same result as split(A,3,axis=0)
print(np.hsplit(A,2)) # same result as split(A,2,axis=1)
# Unequal split: the 4 columns cannot be divided evenly into 3 pieces with split, so use array_split, which allows pieces of different sizes
print(np.array_split(A,3,axis=1)) # three pieces side by side (2, 1 and 1 columns)
#########输出结果############
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
#横向分割为两块
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
#纵向分割为三块
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
#纵向分割为三块
[array([[0, 1, 2, 3]]), array([[4, 5, 6, 7]]), array([[ 8, 9, 10, 11]])]
#横向分割为两块
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
#横向分割为三块,不等比
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2],
[ 6],
[10]]), array([[ 3],
[ 7],
[11]])]
copy & deep copy
import numpy as np
# Contrast plain assignment (aliasing) with an explicit copy.
a = np.arange(4)
print(a)
# Plain assignment does not copy: b and c are just extra names for the same array (much like a C++ reference)
b = a
c = b
a[0] = 11
print(a)
print(c)
print(b is a) # identity check: are b and a the same object?
print(b == a) # element-wise equality of b and a
# Deep copy: a.copy() yields an independent array, so the later change to a does not show up in b
a = np.arange(4)
b = a.copy()
a[0] = 11
print(a)
print(b)
#########输出结果##########
[0 1 2 3]
[11 1 2 3]
[11 1 2 3]
#判断b与a是否相同
True
#判断b与a中的每个元素是否对应相等
[ True True True True]
[11 1 2 3]
[0 1 2 3]
pandas
pandas的核心数据结构是DataFrame(二维表格),通常以“行”为单位向下追加记录;DataFrame的每一列是一个Series,单独取出的一行数据也以Series形式返回
矩阵的基本操作
import pandas as pd
import numpy as np
# Build a Series and two DataFrames, then inspect and sort them.
s = pd.Series([1,2,3,np.nan,44,1])
print(s)
dates = pd.date_range('20190101',periods=6)
print(dates)
# index sets the row labels and columns sets the column labels; when omitted both default to 0, 1, 2, ...
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=['a','b','c','d'])
print(df)
# Two common ways to build a DataFrame: from a NumPy array (above) or from a dict (below),
# where each dict entry becomes one column
df2 = pd.DataFrame({
'A':1,
'B':pd.Timestamp('20190101'),
'C':pd.Series(1,index=list(range(4)),dtype='float32'),
'D':np.array([3]*4,dtype='int32'),
'E':pd.Categorical(["test","train","test","train"]),
'F':'foo'
})
print(df2)
# Inspect the dtype of each column
print(df2.dtypes)
# Row labels
print(df2.index)
# Column labels
print(df2.columns)
# Underlying values
print(df2.values)
# describe: summary statistics (mean, min, max, ...) of the numeric columns
print(df2.describe())
# Sort by labels
df3 = df2.sort_index(axis=1,ascending=False) # axis=1: order the column labels descending
print(df3)
# Sort by values
df4 = df2.sort_values(by='E')
print(df4)
#######输出结果##########
0 1.0
1 2.0
2 3.0
3 NaN
4 44.0
5 1.0
dtype: float64
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
#利用numpy来构造
a b c d
2019-01-01 0.920367 -0.561371 1.249657 -0.091844
2019-01-02 1.888219 -0.367496 1.048375 0.967817
2019-01-03 -0.557569 0.312368 -0.060077 -0.568380
2019-01-04 0.088714 -0.922488 -0.987815 0.355981
2019-01-05 -0.375266 -0.819393 0.570262 -0.346604
2019-01-06 0.665096 0.097616 0.894537 0.326664
#用字典方式来构造
A B C D E F
0 1 2019-01-01 1.0 3 test foo
1 1 2019-01-01 1.0 3 train foo
2 1 2019-01-01 1.0 3 test foo
3 1 2019-01-01 1.0 3 train foo
A int64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
#查看行索引
Int64Index([0, 1, 2, 3], dtype='int64')
#查看列索引
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1 Timestamp('2019-01-01 00:00:00') 1.0 3 'test' 'foo']
[1 Timestamp('2019-01-01 00:00:00') 1.0 3 'train' 'foo']
[1 Timestamp('2019-01-01 00:00:00') 1.0 3 'test' 'foo']
[1 Timestamp('2019-01-01 00:00:00') 1.0 3 'train' 'foo']]
#描述 查看数值型数据的平均值、最大小值等属性
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
#按索引排序
F E D C B A
0 foo test 3 1.0 2019-01-01 1
1 foo train 3 1.0 2019-01-01 1
2 foo test 3 1.0 2019-01-01 1
3 foo train 3 1.0 2019-01-01 1
#按值排序
A B C D E F
0 1 2019-01-01 1.0 3 test foo
2 1 2019-01-01 1.0 3 test foo
1 1 2019-01-01 1.0 3 train foo
3 1 2019-01-01 1.0 3 train foo
pandas选择器
import numpy as np
import pandas as pd
# Select data from a date-indexed DataFrame by label, by position and by condition.
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
# Select a column by its label
print(df['A'],'\n或者:',df.A) # df[0] no longer works here: the column labels are now A B C ... instead of the default 0 1 2
# Select rows with a slice
print(df[1:3]) # positions [1, 3): the 2nd and 3rd rows
print(df['20190102':'20190104'])
# Select by label: loc (location)
print(df.loc['20190102'])
# In short: pick a column with df['A'], pick a row by label with df.loc['20190102']; df['20190102'] is an error
print(df.loc[:,['A','B']]) # all rows, columns A and B
# Select by position: iloc (index location), using row/column coordinates
print(df.iloc[3,1]) # 4th row, 2nd column
print(df.iloc[1:3,2:3]) # 3rd column of the 2nd and 3rd rows
# Boolean indexing: select by condition
print([df[df.A > 8]]) # rows whose 'A' value is greater than 8 (the list wrapper is why the output is shown in brackets)
#######结果输出########
A B C D
2019-01-01 0 1 2 3
2019-01-02 4 5 6 7
2019-01-03 8 9 10 11
2019-01-04 12 13 14 15
2019-01-05 16 17 18 19
2019-01-06 20 21 22 23
2019-01-01 0
2019-01-02 4
2019-01-03 8
2019-01-04 12
2019-01-05 16
2019-01-06 20
Freq: D, Name: A, dtype: int32
或者: 2019-01-01 0
2019-01-02 4
2019-01-03 8
2019-01-04 12
2019-01-05 16
2019-01-06 20
Freq: D, Name: A, dtype: int32
A B C D
2019-01-02 4 5 6 7
2019-01-03 8 9 10 11
A B C D
2019-01-02 4 5 6 7
2019-01-03 8 9 10 11
2019-01-04 12 13 14 15
#select by label:loc (location) 通过标签索引来选
A 4
B 5
C 6
D 7
Name: 2019-01-02 00:00:00, dtype: int32
#print(df.loc[:,['A','B']]) 打印出A,B两列
A B
2019-01-01 0 1
2019-01-02 4 5
2019-01-03 8 9
2019-01-04 12 13
2019-01-05 16 17
2019-01-06 20 21
#通过行列坐标来选
13
#
C
2019-01-02 6
2019-01-03 10
#判断选择
[ A B C D
2019-01-04 12 13 14 15
2019-01-05 16 17 18 19
2019-01-06 20 21 22 23]
pandas 设置值
import numpy as np
import pandas as pd
# Modify values in a date-indexed DataFrame by position, by label and by condition.
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
# Assign through positional coordinates
df.iloc[2,2] = 223 # set the 3rd row, 3rd column to 223
print(df)
# Assign through index/column labels
df.loc['20190102','B'] = 567
print(df)
# Assign through a condition. A single .loc call (row mask, column label) writes
# into df itself; the chained form df.A[mask] = 0 may assign to a temporary copy.
df.loc[df.A>=8,'A'] = 0
print(df)
# Add a new column
df['F'] = [1,2,3,4,5,6] # or np.nan
print(df)
######输出结果########
A B C D
2019-01-01 0 1 2 3
2019-01-02 4 5 6 7
2019-01-03 8 9 10 11
2019-01-04 12 13 14 15
2019-01-05 16 17 18 19
2019-01-06 20 21 22 23
#通过坐标选择后赋值修改
A B C D
2019-01-01 0 1 2 3
2019-01-02 4 5 6 7
2019-01-03 8 9 223 11
2019-01-04 12 13 14 15
2019-01-05 16 17 18 19
2019-01-06 20 21 22 23
#通过索引标签
A B C D
2019-01-01 0 1 2 3
2019-01-02 4 567 6 7
2019-01-03 8 9 223 11
2019-01-04 12 13 14 15
2019-01-05 16 17 18 19
2019-01-06 20 21 22 23
#通过判断条件
A B C D
2019-01-01 0 1 2 3
2019-01-02 4 567 6 7
2019-01-03 0 9 223 11
2019-01-04 0 13 14 15
2019-01-05 0 17 18 19
2019-01-06 0 21 22 23
#添加新的一列
A B C D F
2019-01-01 0 1 2 3 1
2019-01-02 4 567 6 7 2
2019-01-03 0 9 223 11 3
2019-01-04 0 13 14 15 4
2019-01-05 0 17 18 19 5
2019-01-06 0 21 22 23 6
pandas处理丢失数据
import numpy as np
import pandas as pd
# Introduce missing values into a DataFrame, then drop, fill and detect them.
dates = pd.date_range('20190101',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
# NaN is a float, so make the two affected columns float up front instead of
# relying on silent upcasting of integer columns during assignment.
df = df.astype({'B':'float64','C':'float64'})
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
# Drop: dropna
print(df.dropna(axis=0,how='any')) # axis=0 drops rows; how='any' drops a row containing at least one NaN, how='all' only a row that is entirely NaN
print(df) # dropna returned a new frame; df itself is unchanged
# Fill: fillna
print(df.fillna(value=0))
# Detect NaN
print(df.isna()) # boolean table, True where a value is missing
print(df.isnull().values.any()) # single boolean: does the data contain any NaN?
#######输出结果##########
A B C D
2019-01-01 0 1 2 3
2019-01-02 4 5 6 7
2019-01-03 8 9 10 11
2019-01-04 12 13 14 15
2019-01-05 16 17 18 19
2019-01-06 20 21 22 23
A B C D
2019-01-01 0 NaN 2.0 3
2019-01-02 4 5.0 NaN 7
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
#丢弃 dropna
A B C D
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
A B C D
2019-01-01 0 NaN 2.0 3
2019-01-02 4 5.0 NaN 7
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
#填充 fillna
A B C D
2019-01-01 0 0.0 2.0 3
2019-01-02 4 5.0 0.0 7
2019-01-03 8 9.0 10.0 11
2019-01-04 12 13.0 14.0 15
2019-01-05 16 17.0 18.0 19
2019-01-06 20 21.0 22.0 23
#判断是否是nan
A B C D
2019-01-01 False True False False
2019-01-02 False False True False
2019-01-03 False False False False
2019-01-04 False False False False
2019-01-05 False False False False
2019-01-06 False False False False
#
True
pandas导入导出数据
import pandas as pd
# Read a CSV file into a DataFrame
# NOTE(review): absolute path on the original author's machine; point it at your own file
data = pd.read_csv('C:\\Users\\17806\\OneDrive\\桌面\\Study\\MyProgram\\student.csv')
print(data)
# Save the DataFrame as a pickle file
data.to_pickle('C:\\Users\\17806\\OneDrive\\桌面\\Study\\MyProgram\\student.pickle')
#####输出结果#######
Student ID name age gender
0 1100 Kelly 22 Female
1 1101 Clo 21 Female
2 1102 Tilly 22 Female
3 1103 Tony 24 Male
4 1104 David 20 Male
5 1105 Catty 22 Female
6 1106 M 3 Female
pandas连接DataFrame concatenating
import pandas as pd
import numpy as np
# Concatenate DataFrames vertically and horizontally, with different join and index options.
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
# Concatenating: axis=0 stacks the frames top to bottom
res = pd.concat([df1,df2,df3],axis=0)
print(res)
# Discard the existing row labels and renumber the rows 0..n-1
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(res)
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','f','e'],index=[2,3,4])
print(df1)
print(df2)
# join is 'inner' or 'outer'; the default is 'outer'
res = pd.concat([df1,df2],join='outer') # shared columns are aligned, the others are filled with NaN
print(res)
res = pd.concat([df1,df2],join='inner') # shared columns are aligned, the others are dropped
print(res)
# reindex
res = pd.concat([df1,df2],axis=1)
print(res)
res = pd.concat([df1,df2],axis=1).reindex(df1.index) # keep only the row labels of df1, in df1's order
print(res)
# Appending whole DataFrames below df1. DataFrame.append no longer exists in
# current pandas, so pd.concat is used for the same result.
res = pd.concat([df1,df2],ignore_index=True)
print(res)
res = pd.concat([df1,df2,df3],ignore_index=True)
print(res)
# Appending a single row given as a Series: turn it into a one-row frame first
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
res = pd.concat([df1,s1.to_frame().T],ignore_index=True)
print(res)
#########输出结果#######
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
#concatenating
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
#ignore_index=True
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c f e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
#列索引相同的归为一列,不同的则为nan
a b c d e f
1 0.0 0.0 0.0 0.0 NaN NaN
2 0.0 0.0 0.0 0.0 NaN NaN
3 0.0 0.0 0.0 0.0 NaN NaN
2 NaN 1.0 1.0 NaN 1.0 1.0
3 NaN 1.0 1.0 NaN 1.0 1.0
4 NaN 1.0 1.0 NaN 1.0 1.0
#列索引相同的归为一列,不同的则剔除
b c
1 0.0 0.0
2 0.0 0.0
3 0.0 0.0
2 1.0 1.0
3 1.0 1.0
4 1.0 1.0
a b c d b c f e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
#将横向索引按照df1重新排序
a b c d b c f e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
#append 向下追加数据,以DataFrame为单元
a b c d e f
0 0.0 0.0 0.0 0.0 NaN NaN
1 0.0 0.0 0.0 0.0 NaN NaN
2 0.0 0.0 0.0 0.0 NaN NaN
3 NaN 1.0 1.0 NaN 1.0 1.0
4 NaN 1.0 1.0 NaN 1.0 1.0
5 NaN 1.0 1.0 NaN 1.0 1.0
a b c d e f
0 0.0 0.0 0.0 0.0 NaN NaN
1 0.0 0.0 0.0 0.0 NaN NaN
2 0.0 0.0 0.0 0.0 NaN NaN
3 NaN 1.0 1.0 NaN 1.0 1.0
4 NaN 1.0 1.0 NaN 1.0 1.0
5 NaN 1.0 1.0 NaN 1.0 1.0
6 2.0 2.0 2.0 2.0 NaN NaN
7 2.0 2.0 2.0 2.0 NaN NaN
8 2.0 2.0 2.0 2.0 NaN NaN
#追加一行数据 Series
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
pandas合并DataFrame merge
import pandas as pd
import numpy as np
# Merge two DataFrames on one key column, then on two key columns.
left = pd.DataFrame({
'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']
})
right = pd.DataFrame({
'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']
})
print(left)
print(right)
# merge defaults to how='inner': keys that do not appear in both frames are dropped
res = pd.merge(left,right,on='key') # on names the column to join on
print(res)
left = pd.DataFrame({
'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']
})
right = pd.DataFrame({
'key1':['K0','K1','K1','K2'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']
})
print(left)
print(right)
# Join on two key columns at once
res = pd.merge(left,right,on=['key1','key2'])
print(res)
########输出结果#########
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
#merge
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
#按照两个key索引来合并
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
pandas画图 plot
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Series data
data1 = pd.Series(np.random.randn(1000),index=np.arange(1000))
data1 = data1.cumsum()
print(data1)
data1.plot()
# DataFrame data
data2 = pd.DataFrame(np.random.randn(1000,4),index=np.arange(1000),columns=list("ABCD"))
data2 = data2.cumsum()
data2.plot()
print(data2)
# Available plot kinds: bar hist box kde area scatter hexbin pie
# Scatter plot
data3 = data2.plot.scatter(x='A',y='B',color='DarkBlue',label='Class 1')
# ax=data3 reuses the object returned by the first scatter call so both classes share one plot
data4 = data2.plot.scatter(x='A',y='C',color='DarkGreen',label='Class 2', ax=data3)
plt.show()
######输出结果#########
0 -0.931262
1 -0.459424
2 -0.164038
3 -0.196871
4 -0.902499
...
995 -7.521910
996 -7.621425
997 -10.207572
998 -10.859286
999 -11.072143
Length: 1000, dtype: float64
A B C D
0 -0.944536 1.715489 -0.197838 0.104654
1 -0.334956 1.261398 -1.197463 -0.645474
2 1.009486 0.777278 -1.625246 -0.436689
3 0.689034 1.317292 -1.726993 -0.038378
4 1.096938 4.138512 -2.609029 -0.158040
.. ... ... ... ...
995 -17.457821 29.327120 -36.986072 2.700634
996 -16.550651 29.218301 -35.862315 4.810438
997 -17.055353 29.476536 -35.520861 4.162424
998 -16.429215 29.605428 -36.791283 1.169698
999 -17.281577 29.707916 -35.212506 1.860799
[1000 rows x 4 columns]
Series类数据
DataFrame类数据
散点图 scatter