
6
PS 程式碼後面沒有print(),是因為我用jupyter notebook,如果寫在.py檔,最後印出記得要用 print(包起來)
PS 讀取檔案改絕對路徑
file_path = r'D:\data.csv'
data = pd.DataFrame(pd.read_csv(file_path))
import pandas as pd
import numpy as np
# list -> Series
my_list = [1, 2, 3, 4, 5]
series_from_list = pd.Series(data=my_list)
series_from_list

可以指定label
import pandas as pd
import numpy as np
my_data = [10,20,30]
index_ = ['a','b', 'c']
pd.Series(my_data, index_)

import pandas as pd
import numpy as np
my_data= [1,2,3,4]
index_ = ['USA','Taiwan','Mexico','Japan']
pd.Series(my_data, index_)

計算新的list
function_list = [sum, np.mean, len]
my_data = [10, 20, 30]
result_list = [func(my_data) for func in function_list]
result_list

import pandas as pd
import numpy as np
# dict -> Series
my_dict = {'a': 100, 'b': 200, 'c': 300}
series_from_dict = pd.Series(data=my_dict)
series_from_dict

import pandas as pd
import numpy as np
# NumPy Array -> Series
my_array = np.array([10, 20, 30, 40, 50])
series_from_array = pd.Series(data=my_array)
series_from_array

import pandas as pd
# lists -> DataFrame
my_data_list = [['Eva', 22], ['Cata', 27], ['Charlie', 35]]
columns_list = ['Name', 'Age']
df_from_list = pd.DataFrame(data=my_data_list, columns=columns_list)
df_from_list

import pandas as pd
# dicts -> DataFrame
my_data_dict = {'Name': ['Eva', 'Cata', 'Charlie'], 'Age': [25, 30, 35]}
df_from_dict = pd.DataFrame(data=my_data_dict)
df_from_dict

import pandas as pd
import numpy as np
# array_2d -> DataFrame
my_data_array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
columns_array = ['A', 'B', 'C']
df_from_array = pd.DataFrame(data=my_data_array, columns=columns_array)
df_from_array

import pandas as pd
import numpy as np
index_ = ['USA','Taiwan','Mexico','Japan']
my_data= [1,2,3,4]
ser = pd.Series(my_data, index_)
# 取 index
ser['Mexico']

import pandas as pd
import numpy as np
ser1 = pd.Series([1,2,3,4],['USA','Taiwan','Mexico','Japan'])
ser2 = pd.Series([1,2,3,4],['USA','Taiwan','Korean','Japan'])
ser1+ser2

import pandas as pd
# 建立 Series
d = pd.Series([1, 2, 3, 4, 5])
print("Original Series:")
print(d)
transposed_series = d.T
transposed_series[1]=9
print("\nTransposed Series:")
print(transposed_series)

從 “2000-01-01” 開始,以每年(”Y”)為間隔
datetime_series = pd.Series(
pd.date_range("2000-01-01", periods=3, freq="Y")
)
datetime_series

import pandas as pd
data = {'年齡': [25, 30, 35],
'收入': [50000, 60000, 70000]}
df = pd.DataFrame(data)
# 計算 DataFrame 的大小
df_size = df.size
print("DataFrame size:", df_size)
print()
print(df)

import numpy as np
df = pd.DataFrame(np.random.randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df_size = df.size
print("DataFrame size:", df_size)
print()
print(df)

print(df['W'])
print()
print(df[['W','Z']])

可以直接加入新的欄位
df['W_X'] = df['W'] + df['X']
df['W_Y'] = df['W'] + df['Y']
df


# 沿用上方df
df.describe()

# 沿用上方df
df.info()

布林值判斷
# 延續上方 df
booldf = df > 0
booldf

顯示True的
# 延續上方 df
df[booldf]

也可以和在一起寫: 布林值判斷+顯示True的
# 延續上方 df
df[df > 0]

#延續上方 df
df.drop('W_Y',axis=1, inplace=True)
df

df_2 = df.drop('E',axis=0)
df_2

# 延續上方 df
df_2.shape

#延續上方 df
df_2.dtypes


df.loc['A']

df.loc['A'] > -1

df.loc['A'][df.loc['A'] > -1]

df[df['W'] > 0]

df['W'][df['W'] > 0] = df[df['W'] > 0]['W']

df[df['W'] > 0][['X','Y']]

df.loc['A','Y']

df.loc[['A','C'],['X','Y']]

df[(df['W']>0) | (df['Y']>1)]

hello = 'h e l l o'.split()
hello

hello = 'h e l l o'.split()
df['hello'] = hello
df.set_index('hello', inplace=True)
df

df.iloc[2]

df.iloc[2,3]

df.iloc[[0,2],[1,2]]

df.iloc[0:3, :]

import pandas as pd
import numpy as np
outside = ['G1', 'G1', 'G1', 'G1', 'G2', 'G2', 'G2', 'G2']
inside = [1,2,3,4,1,2,3,4]
hier_index = list(zip(outside, inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

# 再轉df
df = pd.DataFrame(np.random.randn(8, 2), hier_index, ['A', 'B'])
df

df.xs(('G1'))

df.xs('G1').loc[3, 'A']

df.xs('G2').loc[2, 'B']

也可以直接loc一組
df.loc['G1']
df.loc['G1'].loc[2]
df.loc['G1'].loc[2]['B']

df.index.names = ['Groups','Num']

df.xs(key=3, level='Num', axis=0, drop_level=True)

先預設一個df
import pandas as pd
import numpy as np
d = {'A':[1,2,np.nan], 'B':[3,4,5], 'C':[5,np.nan,np.nan]}
df = pd.DataFrame(d)
df

df.dropna()

df.dropna(thresh=2)

df.dropna(axis=1)

df.fillna(value='FILL VALUE')

df['A'].fillna(value="hi")

df['A'].fillna(value='hi', inplace=True)
df

先預設一個df
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Cata','Chris','Emma','Cata','John','Allen'],
'Sales':[200,120,150,160,350,240]
}
df = pd.DataFrame(data)
df

df.groupby
求每間公司被投資額總額
df.groupby('Company')['Sales'].sum()

求每間公司被投資額總平均/人
df.groupby('Company')['Sales'].mean()

求每個人總投資額
df.groupby('Person')['Sales'].sum()

求每間公司被投資最高額和擁有者
df.groupby('Company')['Sales'].max()

PS 如果剛好每個欄位都要,可以不用選欄位名稱
df.groupby('Company').max()

假設金額是美金,要算成台幣匯率,求每人台幣總花費
先新增一籃,放置換算為台幣的值
def multi_30(p):
return p*30
df['Sales_in_TWD'] = df['Sales'].apply(multi_30)
# 可以捨棄def,直接用lambda寫
# df['Sales_in_TWD_2'] = df.groupby('Company')['Sales'].sum().apply(lambda p: p*30)
df

再分組
df.groupby('Person')['Sales_in_TWD'].sum().sort_values(ascending=False)

求FB Sales 的總額
df.groupby('Company')['Sales'].sum().loc['FB']

PS 如果剛好每個欄位都要,可以不用選欄位名稱
df.groupby('Company').sum().loc['FB']

求FB Sales 的標準差
df.groupby('Company')['Sales'].std()

df.groupby('Company').count()

df.groupby('Company')['Sales'].idxmax()

df.groupby('Company').describe()

df.groupby('Company').describe().transpose()['FB']

df.isnull()

沿用上方df
df.loc[df['Person'] == 'Cata', 'Person'] = 'Catalina'
#等同於
# bool = df['Person'] == 'Cata'
# df.loc[bool , 'Person'] = 'Catalina'
#會印出

PS 假設多欄位要改替換文字內容,用replace
“`=
columns = [‘column1′,’column2′,’colimn3’]
df[columns] = df[columns].replace(‘~’,’-‘,regex=True)
df[columns] = df[columns].replace(‘*’,’x’,regex=True)
df
<br/>
### 求不重複值 .unique()、 求不重複值數量 .nunique()
求不重複值
```=
df['Person'].unique()

求不重複值數量
df['Person'].nunique()

df['Person'].value_counts()

PS 如果用GROUP BY
df.groupby('Person').count()
下條件
df['Person'].value_counts()>=2

PS 如果用GROUP BY
df.groupby(‘Person’).size() >= 2#會印出
求至少出現兩次的Person
counts = df['Person'].value_counts()
result = counts[counts >= 2]
result
# 可以寫在一起
df['Person'].value_counts()[df['Person'].value_counts() >= 2]

求至少出現兩次的Person詳細資料
mask = df['Person'].isin(result.index)
df[mask]

ascending=True 升序
ascending=False 降序
mask = df['Person'].isin(result.index)
df[mask].sort_values('Sales', ascending=False)

設兩張df
left = pd.DataFrame({
'key':['k0','k1','k2','k3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']
})
right = pd.DataFrame({
'key':['k0','k1','k2','k3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','C2','D3']
})

pd.merge(left,right, how='inner', on='key')

設兩張df
left_2 = pd.DataFrame({
'key1':['k0','k0','k1','k2'],
'key2':['k0','k1','k0','k1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']
})
right_2 = pd.DataFrame({
'key1':['k0','k1','k1','k2'],
'key2':['k0','k0','k0','k0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']
})

pd.merge(left_2, right_2, on=['key1','key2'])

保留所有行,填充缺失值
pd.merge(left_2, right_2, how='outer', on=['key1','key2'])

保留 left 所有,right 沒有的用nan補
pd.merge(left_2, right_2, how='left', on=['key1','key2'])

保留 right 所有,left 沒有的用nan補
pd.merge(left_2, right_2, how='right', on=['key1','key2'])

設兩張df
left_3 = pd.DataFrame({
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']},
index=['k0','k1','k2','k3']
)
right_3 = pd.DataFrame({
'C':['C0','C2','C3','C4'],
'D':['D0','D2','D3','D4']},
index=['k0','k2','k3','k4']
)

LEFT JOIN 左交集
left_3.join(right_3)

RIGHT JOIN 右交集
right_3.join(left_3)

pd.concat([left_3, right_3])

先創一個df
import numpy as np
import pandas as pd
random_array = np.random.randint(1,101,(6,5))
df = pd.DataFrame(random_array)
df

賦予欄位/列位名稱
df.columns= 'h e l l p'.split()
df.index = ['a','b','c','d','e','f']
df

df.columns
left_3 = pd.DataFrame({
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']},
index=['k0','k1','k2','k3']
)
right_3 = pd.DataFrame({
'C':['C0','C2','C3','C4'],
'D':['D0','D2','D3','D4']},
index=['k0','k2','k3','k4']
)
# 訪問DataFrame的列名
column_names = left_3.columns
# 列名
print(column_names)

df.index
left_3 = pd.DataFrame({
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']},
index=['k0','k1','k2','k3']
)
right_3 = pd.DataFrame({
'C':['C0','C2','C3','C4'],
'D':['D0','D2','D3','D4']},
index=['k0','k2','k3','k4']
)
# 訪問DataFrame的索引
left_index = left_3.index
right_index = right_3.index
#
print("Left DataFrame Index:")
print(left_index)
print("\nRight DataFrame Index:")
print(right_index)

觀念
import numpy as np
df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=['a', 'b'])
df

df.quantile([.1, .5])

# 百分位數
# 第10%分位数 = 第一个已知值 + (位置 - 第一个已知位置) * (第二个已知值 - 第一个已知值) / (第二个已知位置 - 第一个已知位置)
'a'
# 10% * (n-1) = 0.1 * (4-1) = 0.1 * 3 = 0.3
# 0.3落在第一个值1和第二个值2之间
# 第10%分位数 = 1 + (0.3 - 0) * (2 - 1) / (1 - 0) = 1 + 0.3 * 1 = 1 + 0.3 = 1.3
# 50% * (n-1) = 0.5 * (4-1) = 0.5 * 3 = 1.5
# 1.5落在第二个值2和第三个值3之间
# 第50%分位数 = 2 + (1.5 - 1) * (3 - 2) / (2 - 1) = 2 + 0.5 * 1 = 2 + 0.5 = 2.5
'b'
# 10% * (n-1) = 0.1 * (4-1) = 0.1 * 3 = 0.3
# 0.3落在第一个值1和第二个值10之间
# 第10%分位数 = 1 + (0.3 - 0) * (10 - 1) / (1 - 0) = 1 + 0.3 * 9 = 1 + 2.7 = 3.7
# 50% * (n-1) = 0.5 * (4-1) = 0.5 * 3 = 1.5
# 1.5落在第二个值10和第三个值100之间
# 第50%分位数 = 10 + (1.5 - 1) * (100 - 10) / (2 - 1) = 10 + 0.5 * 90 = 10 + 45 = 55.0'
計算房屋價格的第25個和第75個百分位數
import pandas as pd
# 創建一個示例數據框
data = {
'面積': [1200, 1500, 1800, 2000, 2200, 2400, 2600, 2800, 3000, 3500],
'價格': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700]
}
df = pd.DataFrame(data)
# 計算房屋價格的第25個和第75個百分位數
percentiles = df['價格'].quantile([0.25, 0.75])
print(percentiles)

使用seaborn載入tips資料
import seaborn as sns
import pandas as pd
tips = sns.load_dataset('tips')
df = pd.DataFrame(tips)
df.head()

index,改內容數字,把index2的’tip’, 改成1.55
df.at[2, 'tip'] = 1.55
df.head()

PS 找到所有選項
df['time'].cat.categories
搜尋,改內容 所有符合的,16.99 -> 17.99
df.loc[df['total_bill'] == 16.99, 'total_bill'] = 17.99
df.head()

搜尋,改內容 前五個符合的
dinner_indices = df[df['time'] == 'Dinner'][:5].index
df.loc[dinner_indices, 'time'] = 'Lunch'
df

合併兩張表格
df_merged = pd.merge(df_1,df_2,left_on='column_name', right_on='column_name')
df_merged
篩選出特定日期的資料
df['column_name'] = pd.to_datetime(df['column_name'])
filtered_data = (df['column_name']>='2022-01-01') & (df['column_name']<='2022-01-30')
df_0101_0130 = df[filtered_data]
df_0101_0130
計算出佔比
skincare = df_0101_0130[df_0101_0130['系列']=='保養品']
skincare_sum = skincare['amount'].sum()
sum_amount = df_0101_0130['amount'].sum()
round(skincare_sum / sum_amount *100, 2)
查看品牌銷售數量排行
toner = df_0101_0130[df_0101_0130['系列']=='面霜']
toner.groupby('品牌')['amount'].sum().sort_values(ascending=False)
假設我要提取出整理後的表格
agg_metrics = toner.groupby('品牌').agg({'amount':'sum',
'invsum':'nuniqe',
'unitprice':'mean'})
# 重新命名 .columns
agg_metrics.columns = ['total_amount','total_number_of_invoices','average_product_unit_price']
# 降序排列,取前十筆資料
agg_metrics['average_product_unit_price'].astype(int)
agg_metrics.sort_values(by='total_amount', ascending=False).head(10)
讀取表格內容,google爬蟲搜尋
df = pd.read_excel(r"D:\@@@\@@@.xlsx)
import requests
from ds4 import BeautifulSoup
try:
for index, row in df.iterrows():
code = row['條碼']
response = requests.get(f"https://www.googlw.com/searc?q={code}")
soup = BeautifilSoup(response.content,'html.psrser')
title = soup.find('h3').text
# print(title)
df.loc[index,'搜尋結果'] = titile
except:
pass
df
假設要算出銷售額和利潤的平均值
(Pandas -> NumPy),data[‘ ‘].values
再用NumPy計算,np.mean( )
# 方法一 (如果要做更多數學運算,先轉NumPy)
import numpy as np
import pandas as pd
# 讀取CSV文件
data = pd.DataFrame(pd.read_csv('sales_data.csv'))
# 提取Sales和Profit列 -> 轉換為NumPy陣列
sales_array = data['Sales'].values
profit_array = data['Profit'].values
# NumPy計算銷售額和利潤的平均值
mean_sales = np.mean(sales_array)
mean_profit = np.mean(profit_array)
print(f"平均銷售額:{mean_sales}")
print(f"平均利潤:{mean_profit}")
假設要算出銷售額和利潤的平均值,直接用Pandas計算,data[‘ ‘].mean()
# 方法二 (做單純計算用這個就夠)
import pandas as pd
# 讀取CSV文件
data = pd.DataFrame(pd.read_csv('sales_data.csv'))
# Pandas計算銷售額和利潤的平均值
mean_sales = data['Sales'].mean()
mean_profit = data['Profit'].mean()
print(f"平均銷售額:{mean_sales}")
print(f"平均利潤:{mean_profit}")
(Pandas -> NumPy),data.values
再用 NumPy 計算,np.sum( )
再轉回 Pandas,pd.DataFrame({‘Column_Sums’: column_sums})
import numpy as np
import pandas as pd
# 創建一個簡單的 Pandas DataFrame
data = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [6, 7, 8, 9, 10]})
# 將 整份 Pandas DataFrame 轉換為 NumPy 數組
numpy_array = data.values
# 在 NumPy 數組上進行統計運算,例如計算每列的總和
column_sums = np.sum(numpy_array, axis=0)
# 將計算結果轉換回 Pandas DataFrame
result = pd.DataFrame({'Column_Sums': column_sums})
# 打印計算結果
print(result)
承上,代入csv檔
import numpy as np
import pandas as pd
# 讀取CSV文件
# 縮寫 data = pd.read_csv('sales_data.csv')
# 不同檔案類型,pd.read_excel(' ', sheetname=' ')、pd.read.html(' ')
data = pd.DataFrame(pd.read_csv('sales_data.csv'))
# 提取Sales和Profit列 -> 轉換為NumPy陣列
sales_array = data['Sales'].values
profit_array = data['Profit'].values
# 使用NumPy計算銷售額和利潤的平均值
mean_sales = np.mean(sales_array)
mean_profit = np.mean(profit_array)
# 使用NumPy計算銷售額和利潤的標準差
std_sales = np.std(sales_array)
std_profit = np.std(profit_array)
# 使用NumPy計算銷售額和利潤的中位數
median_sales = np.median(sales_array)
median_profit = np.median(profit_array)
# 使用NumPy計算銷售額和利潤的百分位數,例如第25和第75百分位數
percentile_25_sales = np.percentile(sales_array, 25)
percentile_75_sales = np.percentile(sales_array, 75)
percentile_25_profit = np.percentile(profit_array, 25)
percentile_75_profit = np.percentile(profit_array, 75)
# 創建一個包含計算結果的新 Pandas DataFrame
summary_data = pd.DataFrame({
'平均銷售額': [mean_sales],
'平均利潤': [mean_profit],
'銷售額標準差': [std_sales],
'利潤標準差': [std_profit],
'銷售額中位數': [median_sales],
'利潤中位數': [median_profit],
'銷售額第25百分位數': [percentile_25_sales],
'銷售額第75百分位數': [percentile_75_sales],
'利潤第25百分位數': [percentile_25_profit],
'利潤第75百分位數': [percentile_75_profit]
})
print(summary_data)
假設我讀取一個csv,用numpy轉換考試分數,比例不變下,所有數字都要在0~1之間
import numpy as np
import pandas as pd
# 讀取CSV文件
data = pd.DataFrame(pd.read_csv('data.csv'))
# 找到數據的最小值和最大值
min_value = data.min().min()
max_value = data.max().max()
# 縮放數據到0到1之間 # 等於 scaled_data/100
scaled_data = (data - min_value) / (max_value - min_value)
scaled_data
#如果要寫回新的CSV檔
scaled_data.to_csv('scaled_data.csv', index=False)
將 Pandas 對象 -> 為 NumPy 數組 -> .ravel()/.flatten()方法將二維轉為一維陣列
.values.ravel() = .values.flatten()
練習 : 從數組中抽一個整數隨機數
import numpy as np
import pandas as pd
data = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [6, 7, 8, 9, 10]})
scaled_data = data.mean().astype(int)
random_value = np.random.choice(scaled_data.values.ravel())
random_value
# 會隨機印出一個數字