Table of Contents
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
時(shí)間(戳)的格式化
data=pd.read_csv(r"D:\downloads\tempo使用1.csv",encoding='gbk')
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
pd.to_datetime(data['COLLECTTIME'])[0]
Timestamp('2014-10-01 00:00:00')
data['date']=pd.to_datetime(data['COLLECTTIME'],format="%Y-%m-%d")#數(shù)據(jù)格式化輸出
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
2014-10-01 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
2014-10-02 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
2014-10-03 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
2014-10-04 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
2014-10-05 |
提取時(shí)間特征:年、月、日、季度等
data['quarter']=data['date'].dt.quarter#提取季度
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
quarter |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
2014-10-01 |
4 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
2014-10-02 |
4 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
2014-10-03 |
4 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
2014-10-04 |
4 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
2014-10-05 |
4 |
data['month'],data['day']=data['date'].dt.month,data['date'].dt.day#提取月、天
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
quarter |
month |
day |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
2014-10-01 |
4 |
10 |
1 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
2014-10-02 |
4 |
10 |
2 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
2014-10-03 |
4 |
10 |
3 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
2014-10-04 |
4 |
10 |
4 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
2014-10-05 |
4 |
10 |
5 |
data['dayofweek']=data['date'].dt.dayofweek#一周內(nèi)的第幾天
data['weekofyear']=data['date'].dt.week#一年中的第幾周
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
quarter |
month |
day |
dayofweek |
weekofyear |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
2014-10-01 |
4 |
10 |
1 |
2 |
40 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
2014-10-02 |
4 |
10 |
2 |
3 |
40 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
2014-10-03 |
4 |
10 |
3 |
4 |
40 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
2014-10-04 |
4 |
10 |
4 |
5 |
40 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
2014-10-05 |
4 |
10 |
5 |
6 |
40 |
a=pd.to_datetime('2020-1-1')
判斷開(kāi)始、結(jié)束
#判斷開(kāi)始、結(jié)束
print(a.is_year_end,
a.is_year_start,
a.is_month_end,
a.is_month_start,
a.is_quarter_start,
a.is_quarter_end)
False True False True True False
data['hour']=data['date'].dt.hour
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
quarter |
month |
day |
dayofweek |
weekofyear |
hour |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
2014-10-01 |
4 |
10 |
1 |
2 |
40 |
0 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
2014-10-02 |
4 |
10 |
2 |
3 |
40 |
0 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
2014-10-03 |
4 |
10 |
3 |
4 |
40 |
0 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
2014-10-04 |
4 |
10 |
4 |
5 |
40 |
0 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
2014-10-05 |
4 |
10 |
5 |
6 |
40 |
0 |
判斷是否是一天的高峰時(shí)段
#是否是一天的高峰時(shí)段
data['day_hign']=data['hour'].apply(lambda x:0 if 0<=x<8 else 1)
data.head()
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
quarter |
month |
day |
dayofweek |
weekofyear |
hour |
day_hign |
| 0 |
財(cái)務(wù)管理系統(tǒng) |
34270787.33 |
2014-10-01 |
2014-10-01 |
4 |
10 |
1 |
2 |
40 |
0 |
0 |
| 1 |
財(cái)務(wù)管理系統(tǒng) |
34328899.02 |
2014-10-02 |
2014-10-02 |
4 |
10 |
2 |
3 |
40 |
0 |
0 |
| 2 |
財(cái)務(wù)管理系統(tǒng) |
34327553.50 |
2014-10-03 |
2014-10-03 |
4 |
10 |
3 |
4 |
40 |
0 |
0 |
| 3 |
財(cái)務(wù)管理系統(tǒng) |
34288672.21 |
2014-10-04 |
2014-10-04 |
4 |
10 |
4 |
5 |
40 |
0 |
0 |
| 4 |
財(cái)務(wù)管理系統(tǒng) |
34190978.41 |
2014-10-05 |
2014-10-05 |
4 |
10 |
5 |
6 |
40 |
0 |
0 |
構(gòu)造時(shí)間特征
#構(gòu)造過(guò)去n天的統(tǒng)計(jì)數(shù)據(jù)
def get_statis_n_days_num(data,col,n):
temp=pd.DataFrame()
for i in range(n):
temp=pd.concat([temp,data[col].shift((i+1)*24)],axis=1)
data['avg_'+str(n)+"_days_"+col]=temp.mean(axis=1)
data['median_'+str(n)+"days"+col]=temp.median(axis=1)
data['max_'+str(n)+"_days_"+col]=temp.max(axis=1)
data['min_'+str(n)+"_days_"+col]=temp.min(axis=1)
data['std'+str(n)+"_days_"+col]=temp.std(axis=1)
return data
get_statis_n_days_num(data,'CWXT_DB184C',n=7).tail()
d:\software\python\lib\site-packages\numpy\lib\nanfunctions.py:1114: RuntimeWarning: All-NaN slice encountered
overwrite_input=overwrite_input)
|
SYS_NAME |
CWXT_DB184C |
COLLECTTIME |
date |
quarter |
month |
day |
dayofweek |
weekofyear |
hour |
day_hign |
avg_7_days_CWXT_DB184C |
median_7daysCWXT_DB184C |
max_7_days_CWXT_DB184C |
min_7_days_CWXT_DB184C |
std7_days_CWXT_DB184C |
| 36 |
財(cái)務(wù)管理系統(tǒng) |
35606941.11 |
2014-11-06 |
2014-11-06 |
4 |
11 |
6 |
3 |
45 |
0 |
0 |
34328674.80 |
34328674.80 |
34328674.80 |
34328674.80 |
NaN |
| 37 |
財(cái)務(wù)管理系統(tǒng) |
35546714.13 |
2014-11-07 |
2014-11-07 |
4 |
11 |
7 |
4 |
45 |
0 |
0 |
34234933.61 |
34234933.61 |
34234933.61 |
34234933.61 |
NaN |
| 38 |
財(cái)務(wù)管理系統(tǒng) |
35510966.73 |
2014-11-08 |
2014-11-08 |
4 |
11 |
8 |
5 |
45 |
0 |
0 |
34022726.41 |
34022726.41 |
34022726.41 |
34022726.41 |
NaN |
| 39 |
財(cái)務(wù)管理系統(tǒng) |
35491498.51 |
2014-11-09 |
2014-11-09 |
4 |
11 |
9 |
6 |
45 |
0 |
0 |
35016309.47 |
35016309.47 |
35016309.47 |
35016309.47 |
NaN |
| 40 |
財(cái)務(wù)管理系統(tǒng) |
35601990.55 |
2014-11-10 |
2014-11-10 |
4 |
11 |
10 |
0 |
46 |
0 |
0 |
34981412.82 |
34981412.82 |
34981412.82 |
34981412.82 |
NaN |
時(shí)間差及其轉(zhuǎn)換
a=pd.to_datetime('2020-07-21 12:12:01')
b=pd.to_datetime('2020-07-20 12:12:23')
c=pd.to_datetime('2020-07-21 00:12:01')
d=pd.to_datetime('2020-07-21 01:00:01')
a-b#計(jì)算時(shí)間差
Timedelta('0 days 23:59:38')
(a-c)
Timedelta('0 days 12:00:00')
#將時(shí)間差,轉(zhuǎn)換成秒
(a-c).seconds
43200
(d-c).seconds
2880
# 跨天轉(zhuǎn)化秒時(shí)容易出錯(cuò)
(a-b).seconds
86378
def calTimesDiff(t1, t2):
"""
計(jì)算時(shí)間戳之間的差值,單位h(同一天)
:param t1:
:param t2:
:return:
"""
# 1.先轉(zhuǎn)成秒
# 2.再將秒轉(zhuǎn)換成小時(shí)
t1_seconds = str(t1).split()[-1].split(":")
t1_seconds = sum([int(i) * 60 ** (2 - num) for num, i in enumerate(t1_seconds)])
t2_seconds = str(t2).split()[-1].split(":")
t2_seconds = sum([int(i) * 60 ** (2 - num) for num, i in enumerate(t2_seconds)])
return abs(round((t2_seconds - t1_seconds) / 3600, 3))
calTimesDiff(d,c)
0.8
a-b#計(jì)算時(shí)間差
Timedelta('0 days 23:59:38')
#將a-b轉(zhuǎn)換成小時(shí)
def convertTimestampToSec(a_b):
"""
將時(shí)間差轉(zhuǎn)換成秒
"""
a_b_str=str(a_b).split()
day=int(a_b_str[0])
sec=sum([int(i)*60**(2-num) for num,i in enumerate(a_b_str[-1].split(":"))])
res=day*24*3600+sec
return res
convertTimestampToSec(a-b)#23*3600+59*60+38
86378
時(shí)間的前進(jìn)與后退
pd.Timedelta
help(pd.Timedelta)
class Timedelta(_Timedelta)
| Represents a duration, the difference between two dates or times.
|
| Timedelta is the pandas equivalent of python's ``datetime.timedelta``
| and is interchangeable with it in most cases.
|
| Parameters
| ----------
| value : Timedelta, timedelta, np.timedelta64, string, or integer
| unit : str, default 'ns'
| Denote the unit of the input, if input is an integer.
|
| Possible values:
|
| * 'Y', 'M', 'W', 'D', 'T', 'S', 'L', 'U', or 'N'
| * 'days' or 'day'
| * 'hours', 'hour', 'hr', or 'h'
| * 'minutes', 'minute', 'min', or 'm'
| * 'seconds', 'second', or 'sec'
| * 'milliseconds', 'millisecond', 'millis', or 'milli'
| * 'microseconds', 'microsecond', 'micros', or 'micro'
| * 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'.
a+pd.Timedelta(2,unit='days')
Timestamp('2020-07-23 12:12:01')
a+pd.Timedelta(2,unit='m')
Timestamp('2020-07-21 12:14:01')
pd.DateOffset
help(pd.DateOffset)`也具有的類似的功能:但是需要注意關(guān)鍵詞單復(fù)數(shù)的區(qū)別:
Parameters that **add** to the offset (like Timedelta):
|
| - years
| - months
| - weeks
| - days
| - hours
| - minutes
| - seconds
| - microseconds
| - nanoseconds
|
Parameters that **replace** the offset value:
|
| - year
| - month
| - day
| - weekday
| - hour
| - minute
| - second
| - microsecond
| - nanosecond.
a+pd.DateOffset(days=1)
Timestamp('2020-07-22 12:12:01')
b+pd.DateOffset(day=1)
Timestamp('2020-07-01 12:12:23')