Glider
Loading...
Searching...
No Matches
Preprocessing.py
Go to the documentation of this file.
1import awswrangler as wr
2
def preprocessing(self, filename, features, session, rel_col):
    """Load a delimited file from S3 so it can be processed downstream.

    The file is first read with the encoding supplied in ``features`` and
    with ``rel_col`` forced to ``str``. If that read fails, it is retried
    once with UTF-8 encoding.

    Args:
        filename (str): s3 full path of filename
        features (dict): contains "delimiter", "skip_rows" and "encoding"
            required for current file
        session (boto3 obj): AWS client connection
        rel_col (str): column name used to identify upc/release_id

    Returns:
        df (pandas dataframe)

    Raises:
        KeyError: if ``features`` lacks one of the required keys.
        Exception: whatever the UTF-8 retry raises, if it fails as well.
    """
    delimiter = features["delimiter"]
    skip_rows = features["skip_rows"]
    encoding = features["encoding"]

    try:
        return wr.s3.read_csv(
            filename,
            sep=delimiter,
            skiprows=skip_rows,
            encoding=encoding,
            dtype={rel_col: str},
            low_memory=False,
            boto3_session=session,
        )
    except Exception:
        # Deliberately broad best-effort retry, but no longer a bare
        # ``except:`` so KeyboardInterrupt/SystemExit propagate instead of
        # silently triggering a second S3 read.
        # NOTE(review): the retry does not pass dtype={rel_col: str}, so
        # rel_col may come back with an inferred dtype on this path --
        # confirm whether that is intentional.
        return wr.s3.read_csv(
            filename,
            sep=delimiter,
            skiprows=skip_rows,
            encoding="utf-8",
            low_memory=False,
            boto3_session=session,
        )
preprocessing(self, filename, features, session, rel_col)