Glider
Loading...
Searching...
No Matches
SpotifyTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3
def preprocessing(self, filename, features, session, rel_col):
    """Load a delimited file from S3 and flip the sign of its discovery-mode fee column.

    Args:
        filename (str): s3 full path of filename
        features (dict): must provide "delimiter", "skip_rows" and "encoding" for the current file
        session (boto3 obj): AWS session, forwarded to awswrangler as ``boto3_session``
        rel_col (str): column name used to identify upc/release_id; it is read as ``str``
    Returns: df (pandas dataframe)
    Raises:
        KeyError: if ``features`` lacks one of the required keys, or the file has
            neither a "discovery_mode_fee" nor a "Discovery Mode Fee" column
    """
    df = wr.s3.read_csv(
        filename,
        sep=features["delimiter"],
        skiprows=features["skip_rows"],
        encoding=features["encoding"],
        dtype={rel_col: str},
        boto3_session=session,
    )
    # The fee column arrives under one of two header spellings. Only a missing
    # column triggers the fallback; any other failure (e.g. while negating the
    # values) propagates as-is instead of being masked by a KeyError on the
    # alternate name.
    fee_col = "discovery_mode_fee"
    try:
        fee = df[fee_col]
    except KeyError:
        fee_col = "Discovery Mode Fee"
        fee = df[fee_col]
    df[fee_col] = fee * (-1)
    return df
24
def date(self, filename):
    """Build a first-of-month date string from the "-YYYYMM" stamp in the filename.

    For example SPOTIFY_gyrostream-track-for-breakage-202307.txt -> "2023-07-01"
    Args:
        filename (str): current filename
    Returns: date_str (str), formatted as YYYY-MM-01
    Raises:
        IndexError: if the filename has no "-20YYMM" stamp with a month in 01-12
    """
    # The month is validated inside the pattern so that an unrelated digit run
    # (e.g. "-2012345678") is skipped instead of producing "2012-34-01".
    stamp = re.search(r"-(20\d{2})(0[1-9]|1[0-2])", filename)
    if stamp is None:
        # IndexError is kept (the original indexed an empty findall result) so
        # existing callers that catch it keep working; the message now names the file.
        raise IndexError(f"no -YYYYMM date stamp found in filename: {filename!r}")
    year, month = stamp.groups()
    return f"{year}-{month}-01"
39
def dateTrends(self, filename):
    """Extract the first YYYY-MM-DD date found in the filename.

    Args:
        filename (str): current filename
    Returns: date (str), exactly as it appears in the filename
    Raises:
        IndexError: if the filename contains no YYYY-MM-DD date
    """
    found = re.search(r"\d{4}-\d{2}-\d{2}", filename)
    if found is None:
        # IndexError is kept (the original indexed an empty findall result) so
        # existing callers that catch it keep working; the message now names the file.
        raise IndexError(f"no YYYY-MM-DD date found in filename: {filename!r}")
    # The match can only contain digits and hyphens, so no further cleanup is needed.
    return found.group(0)
51
def territoryTrends(self, df, filename):
    """Set the territory column from the territory token in the filename.

    The token is the first run of 2-3 word characters that is immediately
    followed by a dot, e.g. "US" in "spotify-streams-2023-07-15-US.csv".
    NOTE(review): the pattern is not anchored to a word boundary, so a longer
    word before the dot contributes only its last three characters — confirm
    that filenames always carry a 2-3 character code there.

    Args:
        df (pandas dataframe): dataframe where changes will be applied (modified in place)
        filename (str): current filename
    Returns: df (pandas dataframe), with "territory_from_file" set
    Raises:
        IndexError: if the filename contains no such token
    """
    found = re.search(r"\w{2,3}\.", filename)
    if found is None:
        # IndexError is kept (the original indexed an empty findall result) so
        # existing callers that catch it keep working; the message now names the file.
        raise IndexError(f"no territory token found in filename: {filename!r}")
    territory = found.group(0).replace(".", "")
    df["territory_from_file"] = territory
    return df
preprocessing(self, filename, features, session, rel_col)