How to deal with the ArrayMemoryError problem? Unable to allocate memory for an array?

Question

I am trying to process data but i constantly run on this Error:

Traceback (most recent call last):
  File "d:\CSlab\biolabelandmethylevelmml.py", line 60, in 
    cpg_data = read_result_split(folder_path)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\CSlab\biolabelandmethylevelmml.py", line 52, in read_result_split
    data_numpy = np.concatenate([data[:, :max_cols] for data in rawdata], axis=0)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 13.5 GiB for an array with shape (2088, 865919) and data type float64

Here is my code:

"""
基於'不健康者'的'CpG位點甲基化'&'生化指數'
建立'血液疾病'預測模型
"""
import pandas as pd
import os
import gc
import numpy as np
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import train_test_split
#start
print('start at', datetime.now().strftime('%H:%M:%S'))

#get cpg data
folder_path = 'D:\CSlab\cs data\result_split'
def read_result_split(folder_path):
    # read file
    print("Start read result split at: ", datetime.now().time())
    file_names = os.listdir(folder_path)
    with open(os.path.join(folder_path, os.listdir(folder_path)[0]), 'r',encoding="utf-8") as f:
        ncols = len(f.readline().split(','))
    # init
    rawdata = []
    for i in tqdm(range(len(file_names)), miniters=20):
        file_name = file_names[i]
        file_path = os.path.join(folder_path, file_name)
        # get numbers of max columns
        with open(file_path, 'r') as f:
            ncols = len(f.readline().split(','))
        # read csv
        try:
            df = pd.read_csv(file_path, header=None, usecols=list(range(1, ncols)), low_memory=False, on_bad_lines='skip')
            df = df.fillna(0)
        except Exception as e:
            print(f"Cannot read file {file_path}, Error Message:{str(e)}")
            continue

        # clear
        gc.collect()
        # save to data
        min_cols = min(ncols, len(df.columns))
        df = df.iloc[:, :min_cols] 
        df.columns = ['Age'] + [str(i) for i in range(min_cols-1)]
        rawdata.append(df.values)


    print("Read file done at: ", datetime.now().time())

    # list to numpy
    max_cols = min([data.shape[1] for data in rawdata])
    data_numpy = np.concatenate([data[:, :max_cols] for data in rawdata], axis=0)

    # structuralize data
    dtype = [('Age', 'i4')] + [(str(i), 'f8') for i in range(ncols)]
    structured_data = np.core.records.fromarrays(data_numpy.transpose(), dtype=dtype)
    
    return structured_data

cpg_data = read_result_split(folder_path)

#get biochem data
biochem_data = pd.read_csv('biolabel.csv')
bc_data = biochem_data['HBSAG_1']

# split train and test
x = np.array(cpg_data)
y = np.array(bc_data)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print('preprocessing down at', datetime.now().strftime('%H:%M:%S'))

# 成本函數
def costFunction(x_train, y_train, a, b):
    predict = (a*x_train).sum(axis=1) + b
    cost = ((y_train - predict)**2).mean()
    return cost

# 梯度計算
def gradFunction(x_train,y_train,a,b):
    predict = (a*x_train).sum(axis=1) + b
    bGradient = (predict - y_train).mean() # b方向斜率
    aGradient = np.zeros(x_train.shape[1])
    for i in range(x_train.shape[1]):
        aGradient[i] = (x_train[:, i]*(predict-y_train)).mean()
    return aGradient, bGradient

# gradient descent 梯度下降
np.set_printoptions(formatter={'float':'{: .5e}'.format})
def gradientDescent(x, y, a, b, learning_rate, run_times, print_times):
    for i in range(run_times+1):
      aGradient, bGradient = gradFunction(x,y,a,b)
      a = a - aGradient * learning_rate
      b = b - bGradient * learning_rate
      if i % print_times == 0:
        print(f"I:{i:7}, cost:{costFunction(x,y,a,b):.2e}")
    return a, b

# 執行
a = np.arange(1, 1001)
b = 100
run_times = 150000
learning_rate = 0.005
print_times = 5000

print(f'learning_rate={learning_rate}, runTimes={run_times}')
aFinal, bFinal = gradientDescent(x_train, y_train, a, b, learning_rate, run_times, print_times)

# 評分
predict = (aFinal*x_test).sum(axis=1) + bFinal
print(f'cost={costFunction(x_test, y_test, aFinal, bFinal)}')
print('ML done at', datetime.now().strftime('%H:%M:%S'))

"""
"""```


Please help me with this problem. Thanks a lot.

How to deal with the ArrayMemoryError problem? Unable to allocate memory for an array?

Answers (0)

Related Questions