Reputation: 1
I am trying to process data but i constantly run on this Error:
Traceback (most recent call last):
File "d:\CSlab\biolabelandmethylevelmml.py", line 60, in <module>
cpg_data = read_result_split(folder_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "d:\CSlab\biolabelandmethylevelmml.py", line 52, in read_result_split
data_numpy = np.concatenate([data[:, :max_cols] for data in rawdata], axis=0)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 13.5 GiB for an array with shape (2088, 865919) and data type float64
Here is my code:
"""
基於'不健康者'的'CpG位點甲基化'&'生化指數'
建立'血液疾病'預測模型
"""
import pandas as pd
import os
import gc
import numpy as np
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import train_test_split
#start
print('start at', datetime.now().strftime('%H:%M:%S'))
#get cpg data
folder_path = 'D:\\CSlab\\cs data\\result_split'
def read_result_split(folder_path):
# read file
print("Start read result split at: ", datetime.now().time())
file_names = os.listdir(folder_path)
with open(os.path.join(folder_path, os.listdir(folder_path)[0]), 'r',encoding="utf-8") as f:
ncols = len(f.readline().split(','))
# init
rawdata = []
for i in tqdm(range(len(file_names)), miniters=20):
file_name = file_names[i]
file_path = os.path.join(folder_path, file_name)
# get numbers of max columns
with open(file_path, 'r') as f:
ncols = len(f.readline().split(','))
# read csv
try:
df = pd.read_csv(file_path, header=None, usecols=list(range(1, ncols)), low_memory=False, on_bad_lines='skip')
df = df.fillna(0)
except Exception as e:
print(f"Cannot read file {file_path}, Error Message:{str(e)}")
continue
# clear
gc.collect()
# save to data
min_cols = min(ncols, len(df.columns))
df = df.iloc[:, :min_cols]
df.columns = ['Age'] + [str(i) for i in range(min_cols-1)]
rawdata.append(df.values)
print("Read file done at: ", datetime.now().time())
# list to numpy
max_cols = min([data.shape[1] for data in rawdata])
data_numpy = np.concatenate([data[:, :max_cols] for data in rawdata], axis=0)
# structuralize data
dtype = [('Age', 'i4')] + [(str(i), 'f8') for i in range(ncols)]
structured_data = np.core.records.fromarrays(data_numpy.transpose(), dtype=dtype)
return structured_data
cpg_data = read_result_split(folder_path)
#get biochem data
biochem_data = pd.read_csv('biolabel.csv')
bc_data = biochem_data['HBSAG_1']
# split train and test
x = np.array(cpg_data)
y = np.array(bc_data)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print('preprocessing down at', datetime.now().strftime('%H:%M:%S'))
# 成本函數
def costFunction(x_train, y_train, a, b):
predict = (a*x_train).sum(axis=1) + b
cost = ((y_train - predict)**2).mean()
return cost
# 梯度計算
def gradFunction(x_train,y_train,a,b):
predict = (a*x_train).sum(axis=1) + b
bGradient = (predict - y_train).mean() # b方向斜率
aGradient = np.zeros(x_train.shape[1])
for i in range(x_train.shape[1]):
aGradient[i] = (x_train[:, i]*(predict-y_train)).mean()
return aGradient, bGradient
# gradient descent 梯度下降
np.set_printoptions(formatter={'float':'{: .5e}'.format})
def gradientDescent(x, y, a, b, learning_rate, run_times, print_times):
for i in range(run_times+1):
aGradient, bGradient = gradFunction(x,y,a,b)
a = a - aGradient * learning_rate
b = b - bGradient * learning_rate
if i % print_times == 0:
print(f"I:{i:7}, cost:{costFunction(x,y,a,b):.2e}")
return a, b
# 執行
a = np.arange(1, 1001)
b = 100
run_times = 150000
learning_rate = 0.005
print_times = 5000
print(f'learning_rate={learning_rate}, runTimes={run_times}')
aFinal, bFinal = gradientDescent(x_train, y_train, a, b, learning_rate, run_times, print_times)
# 評分
predict = (aFinal*x_test).sum(axis=1) + bFinal
print(f'cost={costFunction(x_test, y_test, aFinal, bFinal)}')
print('ML done at', datetime.now().strftime('%H:%M:%S'))
"""
"""```
Please help me with this problem. Thanks a lot.
Upvotes: 0
Views: 47