Glider
IdentifyFormat.py
# -*- coding: utf-8 -*-
import json
import os
import sys
import traceback

import boto3
import chardet
# import smart_open
from dotenv import load_dotenv
from pandas import ExcelFile

from ErrorHandler import ErrorHandler
from Excel_to_csv import pass_excel_to_csv
from MongoConnection import mongo_connection
from ReceivePath import receive_path

load_dotenv()

ENV = os.environ.get("ENVIRONMENT")
print(ENV)

ACCESS_ID = os.environ.get("AWS_KEY_ID")
ACCESS_KEY = os.environ.get("AWS_KEY_SECRET")

mongo_conn = mongo_connection()
collection = mongo_conn.mongo_conn_formats()
snap_collection = mongo_conn.mongo_conn_snapshots()

s3_client = boto3.client("s3", aws_access_key_id=ACCESS_ID, aws_secret_access_key=ACCESS_KEY)

def download_obj(event, file):
    """Receives file info and reads the first bytes of the object to determine its encoding

    Args:
        event (dict): dictionary with all client and sales information
        file (str): current filename
    Returns:
        s3_file (bytes)
        charenc (str)
    """
    bucket = event["bucket"][0]
    path = event["path"][0]
    # Only the first ~1 KB is downloaded; that is enough for chardet to guess the encoding
    obj = s3_client.get_object(Bucket=bucket, Key=path + "/" + file, Range='bytes=0-1024')
    s3_file = obj['Body'].read()
    charenc = chardet.detect(s3_file).get('encoding')
    # s3_dir = f"s3://{bucket}/{path}/{file}"
    # chardet often mislabels these encodings on short samples; fall back to utf-8
    if charenc in ("ascii", "Windows-1252", "Johab"):  # or charenc == "ISO-8859-1"
        charenc = "utf-8"
    return s3_file, charenc

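# A minimal sketch of the chardet step above, runnable without S3 (the byte
# string is made up for illustration):
#
#     import chardet
#     sample = "id;región;ventas\n1;Norte;2 345,10\n".encode("latin-1")
#     print(chardet.detect(sample))
#     # -> something like {'encoding': 'ISO-8859-1', 'confidence': 0.73, ...}
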
def csvHeaders(event, file):
    """Takes the s3_file content and decodes it to extract the first lines as header candidates

    Args:
        event (dict): dictionary with all client and sales information
        file (str): current filename
    Returns:
        headers (list)
    """
    # The detected encoding is shared through a module-level global because
    # identify_format() reads it again when building the output payload
    global encoding
    s3_file, encoding = download_obj(event, file)
    headers = []
    try:
        # splitlines() handles Windows (\r\n), Mac (\r) and Unix (\n) line endings
        headers = s3_file.decode(encoding).splitlines()
        # with smart_open.open(s3_dir, encoding=encoding) as f:
        #     headers = [next(f).strip("\n\r\r\n") for x in range(7)]
    except Exception as e:
        print("File couldn't be loaded: {}".format(e))
        # with smart_open.open(s3_dir, encoding=encoding) as f:
        #     headers = [next(f).strip("\n\r\r\n") for x in range(2)]
    if len(headers) > 1:
        return headers

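# Sketch of the decode-and-split step, assuming utf-8 content (no S3 needed):
#
#     raw = b"col_a,col_b\r\n1,2\r\n3,4\n"
#     print(raw.decode("utf-8").splitlines())
#     # -> ['col_a,col_b', '1,2', '3,4']   (splitlines absorbs \r\n, \r and \n)
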
def identifyHeaders(headers, collection):
    """Searches for a format template matching one of the headers and builds a list with the main features of each column

    Args:
        headers (list): the first lines from the current file
        collection (mongo collection): Mongo collection where format templates are stored
    Returns:
        format (str), delimiters (str), ottoMapping_columns (list), skip_rows (int)
    """
    skip_rows = 0
    # print(headers)
    for header in headers:
        template_format = collection.find_one({"header": header})
        if template_format:  # and template_format[0]:
            print("FORMAT: {} \t VERSION: {} \t DELIMITER: {}".format(template_format["format"], template_format["version"], template_format["delimiters"]))
            print("HEADER IDENTIFIED: {}".format(header))
            ottoMapping_columns = cols_otto(json.loads(template_format["schema"]), json.loads(template_format["ottoMapping"]))
            return template_format["format"], template_format["delimiters"], ottoMapping_columns, skip_rows
        skip_rows += 1
    raise Exception("No format identified")

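# Hypothetical usage sketch (the template fields below mirror the keys this
# function reads; the stored document shape is otherwise assumed): with a
# document like {"header": "col_a,col_b", "format": "...", "version": "...",
# "delimiters": ",", "schema": "<json>", "ottoMapping": "<json>"} in the
# formats collection, the first candidate line that matches a stored header
# wins, and skip_rows tells the caller how many leading lines to drop:
#
#     fmt, delim, cols, skip_rows = identifyHeaders(csvHeaders(event, file), collection)
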
def type_schema(schema):
    """Takes the schema field from the database and extracts the main features of each column

    Args:
        schema (dict): contains the column features for the current file
    Returns:
        values (dict): column name -> pandas dtype
        flagsNull (dict): column name -> nullability flag
    """
    schema = schema["parameters"]
    values = dict()
    flagsNull = dict()
    # Translate template field types into pandas dtypes
    mapper = {"StringType": "object", "IntegerType": "int64", "DoubleType": "float64", "DateType": "datetime64[us]"}
    for i in schema:
        values[i["structFieldTemplate"]["name"]] = mapper[i["structFieldTemplate"]["ftype"]]
        flagsNull[i["structFieldTemplate"]["name"]] = i["structFieldTemplate"]["flagNull"]
    return values, flagsNull

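# Worked example with a made-up schema document (field names are illustrative):
#
#     schema = {"parameters": [
#         {"structFieldTemplate": {"name": "sku", "ftype": "StringType", "flagNull": False}},
#         {"structFieldTemplate": {"name": "units", "ftype": "IntegerType", "flagNull": True}},
#     ]}
#     print(type_schema(schema))
#     # -> ({'sku': 'object', 'units': 'int64'}, {'sku': False, 'units': True})
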
def cols_otto(schema, ottoMapping):
    """Builds the relation between the main features of each column in the file and the mapping template

    Args:
        schema (dict): contains the column features for the current file
        ottoMapping (dict): contains the column features of the desired template
    Returns:
        match (list)
    """
    otto_keys = ottoMapping['parameters']
    dtypes, flags = type_schema(schema)
    match = []
    for i in otto_keys:
        ottoMapping_row = i['mappingTemplate']
        # print(ottoMapping_row)
        if ottoMapping_row["ftype"] == "header":
            # Enrich header rows with the dtype and nullability derived from the schema
            ottoMapping_row["dtype"] = dtypes[ottoMapping_row["value"]]
            ottoMapping_row["flagNull"] = flags[ottoMapping_row["value"]]
        match.append(ottoMapping_row)
    return match

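# Sketch with made-up documents: a header mapping row gets the dtype and
# nullability that type_schema() derived for the same column name:
#
#     schema = {"parameters": [
#         {"structFieldTemplate": {"name": "sku", "ftype": "StringType", "flagNull": False}}]}
#     otto = {"parameters": [
#         {"mappingTemplate": {"ftype": "header", "value": "sku"}}]}
#     print(cols_otto(schema, otto))
#     # -> [{'ftype': 'header', 'value': 'sku', 'dtype': 'object', 'flagNull': False}]
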
def identify_format(event, context=None):
    """Executes the full procedure to filter csv, txt and xls files

    Args:
        event (dict): dictionary with all client and sales information
        context: required only for AWS Lambda execution
    Returns:
        event (dict)
    """
    global encoding
    event = receive_path(event, s3_client)
    print("Status={}".format(event["status"]))
    status = "OK"
    not_identified = dict()
    csv_formats = dict()
    for files in event["files"]:
        file = files["file"]
        file_db_id = files["file_id"]
        print(file)
        file_extension = os.path.splitext(file)[1]
        if "xls" in file_extension:
            try:
                s3_file_obj = s3_client.get_object(Bucket=event["bucket"][0], Key=event["path"][0] + '/' + file)
                xls = ExcelFile(s3_file_obj['Body'].read())
                pass_excel_to_csv(event, files, xls, s3_client)
            except Exception as e:
                print(e)
                m = "{}\n{}".format(sys.exc_info()[2], traceback.format_exc())
                not_identified[file_db_id] = {"file": file}
                eh = ErrorHandler()
                error = eh.handle(e, m, file_db_id)
                status = error
                print("Maybe there is a sheet not identified")
                continue
        else:
            try:
                csv_headers = csvHeaders(event, file)
                formats, delimiter, otto_cols, skip_rows = identifyHeaders(csv_headers, collection)
                if formats not in csv_formats:
                    csv_formats[formats] = {"files": [{"file_id": file_db_id, "file": file, "delimiter": delimiter, "skip_rows": skip_rows, "encoding": encoding}], "columns": otto_cols}
                else:
                    csv_formats[formats]["files"].append({"file_id": file_db_id, "file": file, "delimiter": delimiter, "skip_rows": skip_rows, "encoding": encoding})
            except Exception as e:
                print(e)
                m = "{}\n{}".format(sys.exc_info()[2], traceback.format_exc())
                eh = ErrorHandler()
                error = eh.handle(e, m, file_db_id)
                not_identified[file_db_id] = {"file": file, "error": error}
                # An unidentified csv is not fatal for the batch, so the status stays "OK"
                status = "OK"
                print("File {} not identified".format(file))
                continue
            snap_collection.update_one({"file_db_id": file_db_id}, {"$set": {"status": "identified"}})
    # Propagate the accumulated status (an xls error sets it) instead of hard-coding "OK"
    event["status"] = status
    event["format"] = csv_formats
    event["no_format_identified"] = not_identified
    return event
    # return {
    #     "status": "OK",
    #     "tag": event["tag"],
    #     "cat_gen": event["cat_gen"],
    #     "cat_match": event["cat_match"],
    #     "bucket": event["bucket"],
    #     "path": event["path"],
    #     "client_id": event["client_id"],
    #     "format": csv_formats,
    #     "no_format_identified": not_identified,
    #     "currency": event["currency"]
    # }

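# Hypothetical local invocation: assumes valid AWS and Mongo credentials in the
# environment and an event shaped like the one receive_path() expects. The
# bucket and path below are placeholders, not real resources.
if __name__ == "__main__":
    sample_event = {"bucket": ["my-bucket"], "path": ["incoming/client42"]}
    print(identify_format(sample_event))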