深度学习练习1-excel/csv文件数据转成pytorch张量导入代码
1. 需安装库(使用清华源安装速度会提升)
pip install openpyxl -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install torch torchvision torchaudio -i https://pypi.tuna.tsinghua.edu.cn/simple
2.导入库
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np
import pandas
3.1读取excel方法
class ExcelDataset(Dataset):
    """Map-style dataset that loads (features, label) samples from an Excel sheet.

    The sheet is read with ``pandas.read_excel`` using the first row as the
    header and the first column as the index (so it is not used as a feature).
    The remaining columns are named ``feat1``, ``feat2`` and ``label``.

    Attributes:
        x: float32 tensor built from the ``feat1``/``feat2`` columns.
        y: int32 tensor built from the ``label`` column.
    """

    def __init__(self, filepath="data.xlsx", sheet_name=0):
        """Read one worksheet and convert it to tensors.

        Args:
            filepath: Path of the Excel workbook.
            sheet_name: Sheet to read (position or name). Defaults to 0,
                i.e. the first sheet.

        Raises:
            ValueError: If ``sheet_name`` makes pandas return several sheets
                (``None`` or a list), which cannot be turned into one tensor.
        """
        # NOTE: this method was previously spelled ``__int__``, so it was
        # never invoked on construction and ``ExcelDataset(...)`` failed.
        super().__init__()
        # Log which file and sheet are being read.
        print(f"reading {filepath},sheet={sheet_name}")
        df = pandas.read_excel(
            filepath,
            header=0,       # row 0 is the header row
            index_col=0,    # column 0 is an index (e.g. a serial number)
            names=['feat1', 'feat2', 'label'],
            sheet_name=sheet_name,
            dtype={"feat1": np.float32, "feat2": np.float32, "label": np.int32},
        )
        if isinstance(df, dict):
            # read_excel returns a dict of DataFrames for sheet_name=None/list.
            raise ValueError(
                "sheet_name must select a single sheet, "
                f"got {sheet_name!r} which selects {len(df)} sheet(s)"
            )
        # Log the number of rows and columns that were loaded.
        print(f"the shape of dataframe is {df.shape}")
        feat = df.iloc[:, :2].values   # first two columns -> x
        label = df.iloc[:, 2].values   # third column -> y
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples (length of the label tensor)."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair at ``index`` for use by a DataLoader."""
        return self.x[index], self.y[index]
if __name__ == '__main__':
    # Smoke test: load one sheet and iterate over it in shuffled batches of 8.
    print("Test for ExcelDataset")
    excel_dataset = ExcelDataset(sheet_name="corpus1")  # read sheet "corpus1"
    # Alternatives:
    #   ExcelDataset(sheet_name="corpus2") reads the second sheet.
    #   sheet_name=None makes pandas return a dict holding every sheet
    #   (corpus1 and corpus2), which ExcelDataset cannot consume directly.
    excel_dataloader = DataLoader(excel_dataset, batch_size=8, shuffle=True)
    for idx, (batch_x, batch_y) in enumerate(excel_dataloader):
        print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
        print(batch_x, batch_y)
        # Pseudo-code for a training step once the batch is available.
        # Kept as comments because `model`, `criterion` and `optimizer` are
        # not defined in this file; executing it would raise NameError.
        # NOTE(review): batch_y is int32; some losses expect int64 targets,
        # so a `.long()` cast may be needed - confirm for the loss in use.
        #   output = model(batch_x)
        #   loss = criterion(output, batch_y)
        #   optimizer.zero_grad()
        #   loss.backward()
        #   optimizer.step()
3.2.1 csv读取方法1
class CsvDataset(Dataset):
    """Map-style dataset that loads (features, label) samples from a CSV file.

    The file is parsed with ``pandas.read_csv``: the first row is the header,
    the first column is used as the index, and the remaining columns are
    named ``feat1``, ``feat2`` and ``label``.

    Attributes:
        x: float32 tensor built from the ``feat1``/``feat2`` columns.
        y: int32 tensor built from the ``label`` column.
    """

    def __init__(self, filepath="data.csv"):
        """Read the CSV file and convert it to tensors.

        Args:
            filepath: Path of the CSV file. A CSV has no sheets, so the file
                name is the only thing needed.
        """
        # NOTE: this method was previously spelled ``__int__``, so it was
        # never invoked on construction and ``CsvDataset(path)`` failed.
        super().__init__()
        print(f"reading {filepath}")
        df = pandas.read_csv(
            filepath,
            header=0,       # row 0 is the header row
            index_col=0,    # column 0 is an index, not a feature
            encoding='utf-8',
            names=['feat1', 'feat2', 'label'],
            dtype={"feat1": np.float32, "feat2": np.float32, "label": np.int32},
            skip_blank_lines=True,
        )
        # Log the number of rows and columns that were loaded.
        print(f"the shape of dataframe is {df.shape}")
        feat = df.iloc[:, :2].values   # first two columns -> x
        label = df.iloc[:, 2].values   # third column -> y
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples (length of the label tensor)."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair at ``index`` for use by a DataLoader."""
        return self.x[index], self.y[index]
if __name__ == '__main__':
    # Smoke test: load the default CSV and iterate in shuffled batches of 8.
    print("Test for CsvDataset")
    csv_dataset = CsvDataset()
    csv_dataloader = DataLoader(csv_dataset, batch_size=8, shuffle=True)
    for idx, (batch_x, batch_y) in enumerate(csv_dataloader):
        print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
        print(batch_x, batch_y)
        # Pseudo-code for a training step once the batch is available.
        # Kept as comments because `model`, `criterion` and `optimizer` are
        # not defined in this file; executing it would raise NameError.
        #   output = model(batch_x)
        #   loss = criterion(output, batch_y)
        #   optimizer.zero_grad()
        #   loss.backward()
        #   optimizer.step()
3.2.2 csv 方法2
class CsvDataset(Dataset):
    """Map-style dataset that parses a CSV file as plain text.

    Each data row is expected to be ``index,feat1,feat2,label``: field 0 is
    ignored, fields 1-2 become the features and field 3 the label. The first
    line of the file is treated as a header and skipped.

    Attributes:
        x: float32 tensor of shape ``(num_rows, 2)``.
        y: int32 tensor of shape ``(num_rows,)``.
    """

    def __init__(self, filepath="data.csv"):
        """Read the CSV file line by line and convert it to tensors.

        Args:
            filepath: Path of the UTF-8 encoded CSV file.

        Raises:
            ValueError: If a field cannot be converted to float/int.
            IndexError: If a non-blank row has fewer than four fields.
        """
        # NOTE: this method was previously spelled ``__int__``, so it was
        # never invoked on construction and ``CsvDataset(path)`` failed.
        super().__init__()
        print(f"reading {filepath}")
        feat = []
        label = []
        with open(filepath, encoding='utf-8') as f:
            next(f, None)  # skip the header row (no-op for an empty file)
            for raw_line in f:
                line = raw_line.strip()  # drop the newline / outer whitespace
                if not line:
                    # strip() alone does not skip blank lines; without this
                    # guard an empty row would fail on values[3].
                    continue
                values = line.split(',')
                feat.append([float(v) for v in values[1:3]])
                label.append(int(values[3]))
        # reshape keeps x two-dimensional even when there are no data rows.
        feat = np.array(feat, dtype=np.float32).reshape(-1, 2)
        label = np.array(label, dtype=np.int32)
        self.x = torch.from_numpy(feat)
        self.y = torch.from_numpy(label)

    def __len__(self):
        """Return the number of samples (length of the label tensor)."""
        return len(self.y)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair at ``index`` for use by a DataLoader."""
        return self.x[index], self.y[index]
if __name__ == '__main__':
    # Smoke test: load the default CSV and iterate in shuffled batches of 8.
    print("Test for CsvDataset")
    csv_dataset = CsvDataset()
    csv_dataloader = DataLoader(csv_dataset, batch_size=8, shuffle=True)
    for idx, (batch_x, batch_y) in enumerate(csv_dataloader):
        print(f"batch_id:{idx}, {batch_x.shape}, {batch_y.shape}")
        print(batch_x, batch_y)
        # Pseudo-code for a training step once the batch is available.
        # Kept as comments because `model`, `criterion` and `optimizer` are
        # not defined in this file; executing it would raise NameError.
        #   output = model(batch_x)
        #   loss = criterion(output, batch_y)
        #   optimizer.zero_grad()
        #   loss.backward()
        #   optimizer.step()
注:内容来自b站自学整理, up名为deep_thoughts 课程52