Glider
Loading...
Searching...
No Matches
DeezerTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3from pandas import to_datetime
4
def preprocessing(self, filename, features, session, rel_col):
    """Read a report from S3 into a frame and parse its "Start Report" column.

    Args:
        filename (str): full S3 path of the file to read.
        features (dict): parsing options; the keys "delimiter",
            "skip_rows" and "encoding" are read from it.
        session: S3 connection, forwarded to awswrangler as
            ``boto3_session``.
        rel_col: name of the column that is forced to ``str`` dtype
            while reading.

    Returns:
        The frame returned by ``wr.s3.read_csv``, with "Start Report"
        converted through ``to_datetime`` using the "%d-%m-%Y" format.
    """
    separator = features["delimiter"]
    rows_to_skip = features["skip_rows"]
    charset = features["encoding"]

    report = wr.s3.read_csv(
        filename,
        sep=separator,
        skiprows=rows_to_skip,
        encoding=charset,
        dtype={rel_col: str},
        low_memory=False,
        boto3_session=session,
    )

    # The report stores its start date as day-month-year text.
    parsed_start = to_datetime(report["Start Report"], format="%d-%m-%Y")
    report["Start Report"] = parsed_start
    return report
20
def date(self, filename):
    """Build an ISO-style date string from the date embedded in a filename.

    The first run of eight digits enclosed by underscores is used, e.g.
    "Deezer_GYROstreamMERLIN_20220201_20220228.txt" gives "2022-02-01".

    Args:
        filename (str): current filename.

    Returns:
        str: the matched digits formatted as "YYYY-MM-DD".

    Raises:
        IndexError: if the filename contains no "_<8 digits>_" token.
    """
    tokens = re.findall(r'_\d{8}_', filename)
    # Drop the leading and trailing underscore that delimit the match.
    digits = tokens[0][1:-1]
    year, month, day = digits[:4], digits[4:6], digits[6:]
    return "-".join((year, month, day))
preprocessing(self, filename, features, session, rel_col)