Glider
Loading...
Searching...
No Matches
FacebookTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3from pandas import to_datetime
4
def preprocessing(self, filename, features, session, rel_col):
    """Load a report file from S3 and normalise the columns used downstream.

    The file is read with awswrangler, missing ``event_count`` values are
    replaced by 0 and the column is cast to int, and ``start_date`` is parsed
    into datetimes.

    Args:
        filename (str): s3 full path of filename
        features (dict): contains delimiter, skip_rows and encoding required for current file
        session (boto3 obj): AWS session handed to awswrangler as ``boto3_session``
        rel_col (str): column name used to identify upc/release_id
            (not referenced in this method; kept so the signature is unchanged)
    Returns: df (pandas dataframe)
    """
    df = wr.s3.read_csv(
        filename,
        sep=features["delimiter"],
        skiprows=features["skip_rows"],
        encoding=features["encoding"],
        low_memory=False,
        boto3_session=session,
    )
    # Assign the result back instead of calling fillna(inplace=True) on
    # df["event_count"]: an in-place call on a selected column is not
    # guaranteed to update the parent frame (pandas Copy-on-Write), which
    # would leave NaNs in place and make the int cast fail.
    df["event_count"] = df["event_count"].fillna(0).astype("int")
    # NOTE(review): the format is year/day/month, not year/month/day --
    # confirm this matches the date layout in the source reports.
    df["start_date"] = to_datetime(df["start_date"], format='%Y/%d/%m')
    return df
25
def date(self, filename):
    """Build a first-of-month date string from the date embedded in the filename.

    The filename is expected to contain ``Report_`` followed by six digits
    (year and month). For example
    GYROstream_Pty_Ltd_Facebook-AL-Production_Usage-Report_202203.csv
    gives "2022-03-01".

    Args:
        filename (str): current filename
    Returns: date_str (str) formatted as YYYY-MM-01
    Raises:
        IndexError: if the filename has no ``Report_`` + six-digit token
    """
    found = re.search(r'Report_(\d{6})', filename)
    if found is None:
        # IndexError is kept (rather than ValueError) because that is what
        # the previous findall(...)[0] lookup raised; callers catching it
        # keep working, but now get a message naming the offending file.
        raise IndexError(
            f"no 'Report_YYYYMM' token found in filename: {filename!r}"
        )
    stamp = found.group(1)
    # The first four digits are the year, the remaining two the month;
    # the day is always pinned to 01.
    return f"{stamp[:4]}-{stamp[4:]}-01"
preprocessing(self, filename, features, session, rel_col)