!pip install awswrangler
!pip install pandas
!pip install sklearn
!pip install prov
!pip install seaborn
!pip install pydot
!pip install graphviz
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import awswrangler as wr
import seaborn as sns
import prov


landing_bucket = "s3://lab-landing-raw-sales-123/"
staging_bucket = "s3://lab-staging-clean-sales-123/"

file_name1 = "sales_fact_1997.slice-0-1.v0.0.1.csv"
filename2 = "sales_fact_1997.slice-1-2.v0.0.1.csv"
filename3 = "sales_fact_1997.slice-2-3.v0.0.1.csv"

# Import all the data from the s3 lake in different dataframes
df1 = pd.read_csv("http://big.csr.unibo.it/projects/nosql-datasets/" + file_name1)
df2 = pd.read_csv("http://big.csr.unibo.it/projects/nosql-datasets/" + filename2)
df3 = df = pd.read_csv("http://big.csr.unibo.it/projects/nosql-datasets/" + filename3)

# Merging or appending the data into one big file
first = df1.append(df2, ignore_index=True)
second = first.append(df3, ignore_index=True)
df_merged = second
# wr.s3.to_csv(df, path=input_bucket + file_name)


# look at the top 5
df_merged.head(5)


# dimension of the data
df_merged.shape

(86829, 18)


# TODO


# getting an insight about the data
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86829 entries, 0 to 86828
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   C. City        86829 non-null  object 
 1   C. Country     86829 non-null  object 
 2   C. State       86829 non-null  object 
 3   Customer       86829 non-null  int64  
 4   Yearly Income  86829 non-null  object 
 5   Category       86829 non-null  object 
 6   Subcategory    86829 non-null  object 
 7   Family         86829 non-null  object 
 8   Product        86829 non-null  object 
 9   S. City        86829 non-null  object 
 10  S. Country     86829 non-null  object 
 11  S. State       86829 non-null  object 
 12  Store          86829 non-null  int64  
 13  Type           86829 non-null  object 
 14  Sales Date     86829 non-null  object 
 15  Store Cost     86829 non-null  float64
 16  Store Sales    86829 non-null  float64
 17  Unit Sales     86829 non-null  int64  
dtypes: float64(2), int64(3), object(13)
memory usage: 11.9+ MB


# Run a simple statistics on the data
df_merged.describe().transpose()


# Checking for missing values
df_merged.isnull()


#sum total missing values if any
df_merged.isna().sum()

C. City          0
C. Country       0
C. State         0
Customer         0
Yearly Income    0
Category         0
Subcategory      0
Family           0
Product          0
S. City          0
S. Country       0
S. State         0
Store            0
Type             0
Sales Date       0
Store Cost       0
Store Sales      0
Unit Sales       0
dtype: int64


# columns in the dataset
df_merged.columns

Index(['C. City', 'C. Country', 'C. State', 'Customer', 'Yearly Income',
       'Category', 'Subcategory', 'Family', 'Product', 'S. City', 'S. Country',
       'S. State', 'Store', 'Type', 'Sales Date', 'Store Cost', 'Store Sales',
       'Unit Sales'],
      dtype='object')


# total sale units
df_merged[['Unit Sales']].sum()

Unit Sales    266743
dtype: int64


# number of unique products
df_merged[["Product"]].nunique()

Product    1559
dtype: int64


df_merged.nunique()

C. City             78
C. Country           1
C. State             3
Customer          5581
Yearly Income        8
Category            45
Subcategory        102
Family               3
Product           1559
S. City             13
S. Country           1
S. State             3
Store               13
Type                 5
Sales Date         323
Store Cost       10776
Store Sales       1049
Unit Sales           6
dtype: int64


# importing seaborn and visualization correlation in the data
import seaborn as sns
sns.heatmap(df_merged.corr())

<AxesSubplot:>


# visualizing pairplot
sns.pairplot(df_merged)

<seaborn.axisgrid.PairGrid at 0x7ff32ce8fdf0>


# finally write the data back to s3
wr.s3.to_csv(df=df_merged, path= staging_bucket + "df_merged_clean" , index=False)

{'paths': ['s3://lab-staging-clean-sales-123/df_merged_clean'],
 'partitions_values': {}}


# write database catalog
databases = wr.catalog.databases()
databases


# write the tables
df_tables = wr.catalog.tables()
df_tables


# Read the data from the aws s3 staginb bucket area
raw_data = wr.s3.read_csv(staging_bucket + "df_merged_clean")
raw_data


host = "labbigdata.cbj7jwrd78zb.us-east-1.rds.amazonaws.com"
port = 5432
user = "postgres"
pwd = "Bigdata2022"
db = "postgres"

from sqlalchemy import create_engine
s = 'postgresql://{}:{}@{}:{}/{}'.format(user, pwd, host, str(port), db)
engine = create_engine(s)

# TODO


# DT_customer
df[['C. City', 'C. Country', 'C. State', 'Customer', 'Yearly Income']]\
  .drop_duplicates()\
  .to_sql('customer', engine, index=False, if_exists="replace")


# DT_store
df[['S. City', 'S. Country','S. State', 'Store', 'Type']]\
  .drop_duplicates()\
  .to_sql('store', engine, index=False, if_exists="replace")


# DT_product
df[['Category', 'Subcategory', 'Family', 'Product']]\
  .drop_duplicates()\
  .to_sql('product', engine, index=False, if_exists="replace")


# DT_date
df[['Sales Date']]\
  .drop_duplicates()\
  .to_sql('date', engine, index=False, if_exists="replace")


# FT
df[['Customer', 'Store', 'Product', 'Sales Date', 'Store Sales', 'Store Cost','Unit Sales']]\
  .drop_duplicates()\
  .to_sql('sales', engine, index=False, if_exists="replace")


from prov.model import ProvDocument
from prov.dot import prov_to_dot
from IPython.display import Image

def plot():
    # Create a new provenance document
    d1 = ProvDocument()  # d1 is now an empty provenance document
    
    d1.add_namespace('s3', 'https://s3.console.aws.amazon.com/s3')
    d1.add_namespace('rds', 'https://us-east-1.console.aws.amazon.com/rds')
    d1.add_namespace('unibo', 'https://www.unibo.it')
    d1.add_namespace('pd', 'https://pandas.pydata.org/')
    
    # TODO

    # visualize the graph
    dot = prov_to_dot(d1)
    dot.write_png('prov.png')
    return Image('prov.png')


plot()

	C. City	C. Country	C. State	Customer	Yearly Income	Category	Subcategory	Family	Product	S. City	S. Country	S. State	Store	Type	Sales Date	Store Cost	Store Sales	Unit Sales
0	Albany	USA	OR	117	$90K - $110K	Baking Goods	Cooking Oil	Food	Super Corn Oil	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	2.1177	5.43	3
1	Albany	USA	OR	117	$90K - $110K	Baking Goods	Sauces	Food	Landslide Tomato Sauce	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	1.6500	5.00	4
2	Albany	USA	OR	117	$90K - $110K	Baking Goods	Sugar	Food	Landslide Brown Sugar	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	5.3760	11.20	4
3	Albany	USA	OR	117	$90K - $110K	Bathroom Products	Conditioner	Non-Consumable	Consolidated Silky Smooth Hair Conditioner	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	1.1466	2.94	2
4	Albany	USA	OR	117	$90K - $110K	Beer and Wine	Wine	Drink	Pearl Merlot Wine	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	0.9288	2.16	4

	count	mean	std	min	25%	50%	75%	max
Customer	86829.0	5137.738451	2906.854365	3.000	2632.000	5092.0000	7639.0000	10277.000
Store	86829.0	13.165256	6.207908	2.000	7.000	13.0000	17.0000	24.000
Store Cost	86829.0	2.598186	1.447287	0.153	1.479	2.3684	3.4716	9.875
Store Sales	86829.0	6.508941	3.465830	0.500	3.780	5.9200	8.6400	23.640
Unit Sales	86829.0	3.072050	0.847164	1.000	3.000	3.0000	4.0000	6.000

	C. City	C. Country	C. State	Customer	Yearly Income	Category	Subcategory	Family	Product	S. City	S. Country	S. State	Store	Type	Sales Date	Store Cost	Store Sales	Unit Sales
0	Albany	USA	OR	117	$90K - $110K	Baking Goods	Cooking Oil	Food	Super Corn Oil	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	2.1177	5.43	3
1	Albany	USA	OR	117	$90K - $110K	Baking Goods	Sauces	Food	Landslide Tomato Sauce	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	1.6500	5.00	4
2	Albany	USA	OR	117	$90K - $110K	Baking Goods	Sugar	Food	Landslide Brown Sugar	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	5.3760	11.20	4
3	Albany	USA	OR	117	$90K - $110K	Bathroom Products	Conditioner	Non-Consumable	Consolidated Silky Smooth Hair Conditioner	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	1.1466	2.94	2
4	Albany	USA	OR	117	$90K - $110K	Beer and Wine	Wine	Drink	Pearl Merlot Wine	Salem	USA	OR	13	Deluxe Supermarket	1/21/1997 12:00:00 AM	0.9288	2.16	4
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
86824	Yakima	USA	WA	10226	$10K - $30K	Jams and Jellies	Peanut Butter	Food	Plato Chunky Peanut Butter	Yakima	USA	WA	23	Mid-Size Grocery	6/30/1997 12:00:00 AM	1.9600	4.90	2
86825	Yakima	USA	WA	10226	$10K - $30K	Meat	Hot Dogs	Food	Cutting Edge Chicken Hot Dogs	Yakima	USA	WA	23	Mid-Size Grocery	6/30/1997 12:00:00 AM	1.4229	4.59	3
86826	Yakima	USA	WA	10226	$10K - $30K	Pizza	Pizza	Food	PigTail Frozen Pepperoni Pizza	Yakima	USA	WA	23	Mid-Size Grocery	3/1/1997 12:00:00 AM	1.0752	3.36	2
86827	Yakima	USA	WA	10226	$10K - $30K	Vegetables	Fresh Vegetables	Food	Ebony Garlic	Yakima	USA	WA	23	Mid-Size Grocery	2/11/1997 12:00:00 AM	4.3680	10.40	4
86828	Yakima	USA	WA	10226	$10K - $30K	Vegetables	Fresh Vegetables	Food	Hermanos Green Pepper	Yakima	USA	WA	23	Mid-Size Grocery	2/11/1997 12:00:00 AM	6.0216	15.44	4

DL2DWH Lab¶

Data ingestion¶

Data understanding¶

Which functional dependencies hold among the levels¶

Design and build a relational DWH¶

Connect tableau and aws¶

Doing some ETL , modelling dependencies and populating the database on Aws RDS¶

Modelling the hierachies in tableau¶

Writing and populating the tables in the databases¶

Query the DWH with Tableau (Desktop)¶

Exercise 1¶

Exercise 2¶

Exercise 3¶

(Bonus) Plot the provenance graph¶

	C. City	C. Country	C. State	Customer	Yearly Income	Category	Subcategory	Family	Product	S. City	S. Country	S. State	Store	Type	Sales Date	Store Cost	Store Sales	Unit Sales
0	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
86824	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
86825	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
86826	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
86827	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
86828	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False