Reputation: 564
I am using Pytest to mock a function whose return value is defined in another function.
This is my original function which reads all the files from Azure Data Lake that I would like to mock
class DataLakeEngine:
def read_folder_as_pandas_dataframe(
self,
folder_path_remote: str,
file_type: str = "parquet",
options: Dict = dict(),
) -> pd.DataFrame:
"""
Download all files from the data lake in the data area and concatenate them into a single pandas data frame.
Args:
folder_path_remote (str): full path to the folder in the data area
file_type (str): file type, possible values are 'csv', 'xlsx', 'json', 'pkl' and 'parquet'
options (Dict): additional parameters used for converting file content to data frame
Returns:
(pd.DataFrame): concatenated
"""
df = pd.DataFrame()
paths_in_folder = self.file_system_client.get_paths(folder_path_remote)
for path_in_folder in tqdm(paths_in_folder):
if path_in_folder.is_directory or "delta_log" in path_in_folder.name:
continue
current_file = path_in_folder.name
temp_df = self.read_file_as_pandas_dataframe(
current_file, file_type, options
)
df = pd.concat([df, temp_df]).reset_index(drop=True)
return df
and my logic function which I want to test looks like this
class Preprocessor:
def method1(self):
data = DataLakeEngine.read_folder_as_pandas_dataframe(
self.ticket_data_path, "parquet"
)
return data
My test class looks like this where I read from my local file system instead of reading from Data Lake.
import unittest
import pytest
from preprocessing import PreprocessingPipeline
from unittest.mock import patch
import pandas as pd
import os
from tqdm.autonotebook import tqdm
from typing import Dict
from datalake import DatalakeEngine
class TestPreprocessing(unittest.TestCase):
def mock_read_folder_as_pandas_dataframe(
self, folder_path: str, file_type: str, options: Dict = dict()
) -> pd.DataFrame:
"""
Read all files in a folder and return them as a pandas dataframe
Args:
folder_path (str): path to the folder
file_type (str): type of file to read
options (dict): options to pass to the read
Returns:
pd.DataFrame: dataframe with all the files
"""
file_paths = []
for root, dirs, files in os.walk(folder_path):
for file in files:
if file.endswith(file_type):
file_paths.append(os.path.join(root, file))
df = pd.DataFrame()
for file_path in tqdm(file_paths):
if file_type == "parquet":
temp_df = pd.read_parquet(file_path, **options)
else:
temp_df = pd.read_excel(file_path, **options)
df = pd.concat([df, temp_df]).reset_index(drop=True)
return df
@pytest.fixture(autouse=True)
def data1(self) -> pd.DataFrame:
with patch.object(
DatalakeEngine,
"read_folder_as_pandas_dataframe",
side_effect=self.mock_read_folder_as_pandas_dataframe(
"folder1", "parquet"
),
) as mock_func:
data1 = Preprocessor.method1()
return data1
def test_logic(
self, data1: pd.DataFrame
):
transformed_df = Preprocessor.preprocess_data(
data1
)
assert transformed_df.shape[0] > 0
However, when I run the below test, the mock function returns a string (the name of the first column of my data) instead of pandas dataframe at the read_folder_as_pandas_dataframe call inside the method1() function.
How can I change my patch here to return a data frame here?
Upvotes: 0
Views: 35