Glider
Loading...
Searching...
No Matches
YoutubeTemplate.py
Go to the documentation of this file.
1import re
2import awswrangler as wr
3from pandas import to_datetime
4
def preprocessing(self, filename, features, session, rel_col):
    """Loads the file from S3 and adjusts it to our standard so it can be processed

    The period column ("Month", or "Day" when "Month" cannot be used) is
    converted to datetimes, and every "Asset Type" value is replaced by the
    label returned by ``self.type``.

    Args:
        filename (str): s3 full path of filename
        features (dict): contains the "delimiter", "skip_rows" and "encoding" required for current file
        session (boto3 obj): AWS session, forwarded as ``boto3_session``
        rel_col (str): column name used to identify upc/release_id; it is read as str
    Returns: df (pandas dataframe)
    """
    delimiter = features["delimiter"]
    skip_rows = features["skip_rows"]
    encoding = features["encoding"]
    df = wr.s3.read_csv(
        filename,
        sep=delimiter,
        skiprows=skip_rows,
        encoding=encoding,
        dtype={rel_col: str},
        keep_default_na=False,
        low_memory=False,
        boto3_session=session,
    )
    try:
        df["Month"] = to_datetime(df["Month"], format='%Y%m')
    except (KeyError, ValueError, TypeError):
        # No usable "Month" column (missing, or not in YYYYMM form): fall back
        # to the "Day" column. Only lookup/parsing errors are caught here so
        # that interrupts and unrelated failures are no longer swallowed.
        df["Day"] = to_datetime(df["Day"], format='%Y%m%d')
    # Map the single column directly instead of a row-wise apply through a
    # temporary column: same values, and it also works on a zero-row frame.
    df["Asset Type"] = df["Asset Type"].map(self.type)
    return df
28
def type(self, asset_type):
    """Translates the asset type of the original file into the sale type name required by the client

    Args:
        asset_type (str): asset type shown in the original file
    Returns: 'Youtube Content ID' when the text contains "Sound Recording",
        'Youtube Music' when it contains "Art Track" or "Music Video",
        otherwise None
    """
    # "Sound Recording" is checked first, so it wins when several markers appear.
    if "Sound Recording" in asset_type:
        return 'Youtube Content ID'
    for marker in ("Art Track", "Music Video"):
        if marker in asset_type:
            return 'Youtube Music'
    # Any other asset type has no mapping.
    return None
40
def date(self, filename):
    """Builds the "YYYY-MM" period string from the date embedded in the filename

    For example YouTube_GSDistroPartners_M_20220801_asset_raw_v1-1.csv
    gives "2022-08".

    Args:
        filename (str): current filename
    Returns: date_str (str)
    Raises: IndexError when the filename has no "M_20" + six digits token
    """
    matches = re.findall(r'M_20\d{6}', filename)
    # Only the first token is used; indexing raises IndexError if none was found.
    stamp = matches[0].replace("M_", "")
    year = stamp[:4]
    month = stamp[4:6]
    date_str = f"{year}-{month}"
    return date_str
preprocessing(self, filename, features, session, rel_col)