datalad-handbook/docs/basics/_examples/DL-101-130-110

$ git show ready4analysis
commit 0915b050✂SHA1
Author: Elena Piscopia <elena@example.net>
Date:   Tue Jun 18 16:13:00 2019 +0000

    add script for kNN classification and plotting

diff --git a/code/script.py b/code/script.py
new file mode 100644
index 0000000..2d9bf3a
--- /dev/null
+++ b/code/script.py
@@ -0,0 +1,46 @@
+
+import argparse
+import matplotlib
+matplotlib.use('Agg')
+
+import pandas as pd
+import seaborn as sns
+from sklearn import model_selection
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import classification_report
+
+parser = argparse.ArgumentParser(description="Analyze iris data")
+parser.add_argument('data', help="Input data (CSV) to process")
+parser.add_argument('output_figure', help="Output figure path")
+parser.add_argument('output_report', help="Output report path")
+args = parser.parse_args()
+
+# prepare the data as a pandas dataframe
+df = pd.read_csv(args.data)
+attributes = ["sepal_length", "sepal_width", "petal_length","petal_width", "class"]
+df.columns = attributes
+
+# create a pairplot to plot pairwise relationships in the dataset
+plot = sns.pairplot(df, hue='class', palette='muted')
+plot.savefig(args.output_figure)
+
+# perform a K-nearest-neighbours classification with scikit-learn
+# Step 1: split data into test and training datasets (20:80)
+array = df.values
+X = array[:,0:4]
+Y = array[:,4]
+test_size = 0.20
+seed = 7
+X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
+    X, Y,
+    test_size=test_size,
+    random_state=seed)
+# Step 2: Fit the model and make predictions on the test dataset
+knn = KNeighborsClassifier()
+knn.fit(X_train, Y_train)
+predictions = knn.predict(X_test)
+
+# Step 3: Save the classification report
+report = classification_report(Y_test, predictions, output_dict=True)
+df_report = pd.DataFrame(report).transpose().to_csv(args.output_report)
+