Glider
Loading...
Searching...
No Matches
MixcloudTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3from pandas import to_datetime
4
def preprocessing(self, filename, features, session, rel_col):
    """Read the source file from S3 and adapt it to our standard layout.

    Args:
        filename (str): full S3 path of the file to load
        features (dict): provides the "delimiter", "skip_rows" and "encoding" used to parse the file
        session (boto3 obj): AWS session handed to the S3 reader
        rel_col (str): name of the column identifying upc/release_id; it is read as str
    Returns: df (pandas dataframe) whose "Start_Date" column has been parsed to datetimes
    """
    # Gather the parsing options up front so the read call stays short.
    read_options = {
        "sep": features["delimiter"],
        "skiprows": features["skip_rows"],
        "encoding": features["encoding"],
        "dtype": {rel_col: str},
        "keep_default_na": False,
        "low_memory": False,
        "boto3_session": session,
    }
    df = wr.s3.read_csv(filename, **read_options)

    # Start dates are written as day/month/year in the file.
    start_dates = to_datetime(df["Start_Date"], format="%d/%m/%Y")
    df["Start_Date"] = start_dates
    return df
preprocessing(self, filename, features, session, rel_col)