From fd91de812d72dace05e714b4b32f61d65ad1909a Mon Sep 17 00:00:00 2001
From: Alexander Hess
Date: Mon, 3 Sep 2018 16:12:15 +0200
Subject: [PATCH] Create a data folder for all static files

---
 1_data_cleaning.ipynb                             | 8 ++++----
 2_pairwise_correlations.ipynb                     | 6 +++---
 data_clean.csv => data/data_clean.csv             | 0
 .../data_clean_with_transformations.csv           | 0
 .../data_documentation.txt                        | 0
 data_raw.xls => data/data_raw.xls                 | Bin
 .../weakly_and_strongly_correlated_variables.json | 0
 utils.py                                          | 6 +++---
 8 files changed, 10 insertions(+), 10 deletions(-)
 rename data_clean.csv => data/data_clean.csv (100%)
 rename data_clean_with_transformations.csv => data/data_clean_with_transformations.csv (100%)
 rename data_documentation.txt => data/data_documentation.txt (100%)
 rename data_raw.xls => data/data_raw.xls (100%)
 rename weakly_and_strongly_correlated_variables.json => data/weakly_and_strongly_correlated_variables.json (100%)

diff --git a/1_data_cleaning.ipynb b/1_data_cleaning.ipynb
index 5e7cf36..095d19b 100644
--- a/1_data_cleaning.ipynb
+++ b/1_data_cleaning.ipynb
@@ -30,7 +30,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2018-09-03 15:32:42 CEST\n",
+      "2018-09-03 16:10:26 CEST\n",
       "\n",
       "CPython 3.6.5\n",
       "IPython 6.5.0\n",
@@ -174,13 +174,13 @@
     "}\n",
     "\n",
     "try:\n",
-    "    df = pd.read_excel(\"data_raw.xls\", **kwargs)\n",
+    "    df = pd.read_excel(\"data/data_raw.xls\", **kwargs)\n",
     "except FileNotFoundError:\n",
     "    df = pd.read_excel(\n",
     "        \"https://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls\", **kwargs\n",
     "    )\n",
     "    # Cache the obtained file.\n",
-    "    df.to_excel(\"data_raw.xls\")"
+    "    df.to_excel(\"data/data_raw.xls\")"
    ]
   },
   {
@@ -3085,7 +3085,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.to_csv(\"data_clean.csv\")"
+    "df.to_csv(\"data/data_clean.csv\")"
    ]
   }
  ],
diff --git a/2_pairwise_correlations.ipynb b/2_pairwise_correlations.ipynb
index 75f281a..485c5ec 100644
--- a/2_pairwise_correlations.ipynb
+++ b/2_pairwise_correlations.ipynb
@@ -27,7 +27,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "2018-09-03 15:55:55 CEST\n",
+      "2018-09-03 16:11:06 CEST\n",
       "\n",
       "CPython 3.6.5\n",
       "IPython 6.5.0\n",
@@ -2130,7 +2130,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open(\"weakly_and_strongly_correlated_variables.json\", \"w\") as file:\n",
+    "with open(\"data/weakly_and_strongly_correlated_variables.json\", \"w\") as file:\n",
     "    file.write(json.dumps({\n",
     "        \"weakly_correlated\": sorted(\n",
     "            list(pearson_weakly_correlated) + list(spearman_weakly_correlated)\n",
@@ -3009,7 +3009,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df.to_csv(\"data_clean_with_transformations.csv\")"
+    "df.to_csv(\"data/data_clean_with_transformations.csv\")"
    ]
   }
  ],
diff --git a/data_clean.csv b/data/data_clean.csv
similarity index 100%
rename from data_clean.csv
rename to data/data_clean.csv
diff --git a/data_clean_with_transformations.csv b/data/data_clean_with_transformations.csv
similarity index 100%
rename from data_clean_with_transformations.csv
rename to data/data_clean_with_transformations.csv
diff --git a/data_documentation.txt b/data/data_documentation.txt
similarity index 100%
rename from data_documentation.txt
rename to data/data_documentation.txt
diff --git a/data_raw.xls b/data/data_raw.xls
similarity index 100%
rename from data_raw.xls
rename to data/data_raw.xls
diff --git a/weakly_and_strongly_correlated_variables.json b/data/weakly_and_strongly_correlated_variables.json
similarity index 100%
rename from weakly_and_strongly_correlated_variables.json
rename to data/weakly_and_strongly_correlated_variables.json
diff --git a/utils.py b/utils.py
index 36752be..4f067e8 100644
--- a/utils.py
+++ b/utils.py
@@ -58,7 +58,7 @@ def _get_lines():
     """Obtain the non-empty lines of the data description file."""
     # Read cached data file.
     try:
-        with open("data_documentation.txt", "r") as file:
+        with open("data/data_documentation.txt", "r") as file:
             lines = file.readlines()
     # If there is no cached file, obtain in from the original source.
     except FileNotFoundError:
@@ -67,7 +67,7 @@
             "/jse/v19n3/decock/DataDocumentation.txt"
         )
         # Cache the retrieved file.
-        with open("data_documentation.txt", "w") as file:
+        with open("data/data_documentation.txt", "w") as file:
             file.write(response.text)
         lines = response.text.split("\r\n")
     # Remove header, footer, and empty lines.
@@ -318,7 +318,7 @@ def load_clean_data(subset=None, ordinal_encoded=False):
     """
     # pragma pylint:disable=invalid-name
     df = pd.read_csv(
-        "data_clean.csv",
+        "data/data_clean.csv",
         index_col=INDEX_COLUMNS,
         dtype=object,
         na_values="",  # There are no missing values in the clean data file.
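
Note: the notebook and utils.py hunks above all share the same download-and-cache
pattern, now pointed at the new data/ folder. A minimal standalone sketch of that
pattern follows, with the paths and URL taken from the diff; the makedirs() call
is an assumption added here so the sketch also runs before the data/ folder
exists, and the notebooks' extra **kwargs are omitted:

    import os

    import pandas as pd

    DATA_DIR = "data"  # folder introduced by this patch
    RAW_FILE = os.path.join(DATA_DIR, "data_raw.xls")

    # Assumption: ensure the cache folder exists on a fresh checkout.
    os.makedirs(DATA_DIR, exist_ok=True)

    try:
        # Prefer the locally cached copy of the raw data.
        df = pd.read_excel(RAW_FILE)
    except FileNotFoundError:
        # Fall back to the original source ...
        df = pd.read_excel(
            "https://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls"
        )
        # ... and cache it for the next run.
        df.to_excel(RAW_FILE)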