# DataLad demo script: a sequence of run '<shell command>' and say '<section title>' steps (both helpers are defined elsewhere).
run 'datalad'
|
|
run 'ipython'
|
|
run 'import datalad.api as dl'
|
|
run 'exit'
|
|
run 'datalad --version'
|
|
run 'git config --global --add user.name "Adina Wagner"
|
|
git config --global --add user.email "adina.wagner@t-online.de"'
|
|
say 'VERSION CONTROL'
|
|
run 'datalad create -c text2git DataLad-101'
|
|
run 'cd DataLad-101
|
|
ls # ls does not show any output, because the dataset is empty.'
|
|
run 'git log'
|
|
run 'mkdir books'
|
|
run 'cd books && wget -nv https://sourceforge.net/projects/linuxcommand/files/TLCL/19.01/TLCL-19.01.pdf/download -O TLCL.pdf && wget -nv https://edisciplinas.usp.br/pluginfile.php/3252353/mod_resource/content/1/b_Swaroop_Byte_of_python.pdf -O byte-of-python.pdf && cd ../'
|
|
run 'tree'
|
|
run 'datalad status'
|
|
run 'datalad save -m "add books on Python and Unix to read later"'
|
|
run 'git log -p -n 1'
|
|
run 'cd books && wget -nv https://github.com/progit/progit2/releases/download/2.1.154/progit.pdf && cd ../'
|
|
run 'datalad status'
|
|
run 'datalad save -m "add reference book about git" books/progit.pdf'
|
|
run 'cat << EOT > notes.txt
|
|
One can create a new dataset with '"'"'datalad create PATH'"'"'.
|
|
The dataset is created empty
|
|
|
|
|
|
EOT'
|
|
run 'datalad status'
|
|
run 'datalad save -m "Add notes on datalad create"'
|
|
run 'cat << EOT >> notes.txt
|
|
The command "datalad save [-m] PATH" saves the file
|
|
(modifications) to history. Note to self:
|
|
Always use informative, concise commit messages.
|
|
|
|
EOT'
|
|
run 'datalad status'
|
|
run 'datalad save -m "add note on datalad save"'
|
|
run 'git log -p -n 2'
|
|
say 'CONSUMPTION'
|
|
run 'mkdir recordings'
|
|
run 'datalad clone --dataset . \
|
|
https://github.com/datalad-datasets/longnow-podcasts.git recordings/longnow'
|
|
run 'tree -d # we limit the output to directories'
|
|
run 'cd recordings/longnow/Long_Now__Seminars_About_Long_term_Thinking
|
|
ls'
|
|
# Try to play one episode. The shell is already inside
# Long_Now__Seminars_About_Long_term_Thinking/ (entered by the preceding cd),
# so the file is addressed relative to that directory instead of repeating the
# directory name. Presumably this step is meant to show that the file content
# is not available yet; the "datalad get" steps follow later.
run 'vlc 2003_11_15__Brian_Eno__The_Long_Now.mp3'
|
|
run 'cd ../ # in longnow/
|
|
du -sh # Unix command to show size of contents'
|
|
run 'datalad status --annex'
|
|
run 'datalad get Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3'
|
|
run 'datalad get Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3 \
|
|
Long_Now__Seminars_About_Long_term_Thinking/2003_12_13__Peter_Schwartz__The_Art_Of_The_Really_Long_View.mp3 \
|
|
Long_Now__Seminars_About_Long_term_Thinking/2004_01_10__George_Dyson__There_s_Plenty_of_Room_at_the_Top__Long_term_Thinking_About_Large_scale_Computing.mp3'
|
|
run 'datalad drop Long_Now__Seminars_About_Long_term_Thinking/2003_12_13__Peter_Schwartz__The_Art_Of_The_Really_Long_View.mp3'
|
|
run 'datalad get Long_Now__Seminars_About_Long_term_Thinking/2003_11_15__Brian_Eno__The_Long_Now.mp3'
|
|
say 'NESTING'
|
|
run 'git log --reverse'
|
|
run 'cd ../../'
|
|
run 'git log -p -n 1'
|
|
run 'cd recordings/longnow
|
|
git log --oneline'
|
|
run 'cd ../../'
|
|
run 'cd ../'
|
|
run 'datalad clone git@github.com:psychoinformatics-de/paper-remodnav.git'
|
|
run 'cd paper-remodnav'
|
|
run 'datalad subdatasets'
|
|
run 'cd remodnav
|
|
ls'
|
|
run 'datalad get --recursive --recursion-limit 2 -n .
|
|
ls'
|
|
run 'datalad subdatasets'
|
|
run 'cd remodnav/tests/data/anderson_etal
|
|
git log -n 3'
|
|
|
|
say 'REPRODUCIBLE ANALYSIS'
|
|
run 'cd /demo'
|
|
run 'datalad create -c yoda myanalysis'
|
|
run 'cd myanalysis
|
|
tree'
|
|
run 'datalad clone -d . git@github.com:datalad-handbook/iris_data.git input/'
|
|
|
|
# Write code/script.py through a heredoc. The script fetches input/iris.csv
# with datalad, saves a seaborn pairplot to pairwise_relationships.png, fits a
# k-nearest-neighbours classifier and writes prediction_report.csv.
# Python string quotes inside the single-quoted run argument are spelled
# '"'"' (close quote, literal quote, reopen quote).
# DataFrame.to_csv returns None when it is given a path, so its result is not
# bound to a name.
run 'cat << EOT > code/script.py

import pandas as pd
import seaborn as sns
import datalad.api as dl
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

data = "input/iris.csv"

# make sure that the data are obtained (get will also install linked sub-ds!):
dl.get(data)

# prepare the data as a pandas dataframe
df = pd.read_csv(data)
attributes = ["sepal_length", "sepal_width", "petal_length","petal_width", "class"]
df.columns = attributes

# create a pairplot to plot pairwise relationships in the dataset
plot = sns.pairplot(df, hue='"'"'class'"'"', palette='"'"'muted'"'"')
plot.savefig('"'"'pairwise_relationships.png'"'"')

# perform a K-nearest-neighbours classification with scikit-learn
# Step 1: split data in test and training dataset (20:80)
array = df.values
X = array[:,0:4]
Y = array[:,4]
test_size = 0.20
seed = 7
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed)
# Step 2: Fit the model and make predictions on the test dataset
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_test)

# Step 3: Save the classification report
report = classification_report(Y_test, predictions, output_dict=True)
pd.DataFrame(report).transpose().to_csv('"'"'prediction_report.csv'"'"')

EOT'
|
|
run 'datalad status'
|
|
run 'datalad save -m "add script for kNN classification and plotting" --version-tag ready4analysis code/script.py'
|
|
run 'datalad status'
|
|
run 'datalad run -m "analyze iris data with classification analysis" \
|
|
--input "input/iris.csv" \
|
|
--output "prediction_report.csv" \
|
|
--output "pairwise_relationships.png" \
|
|
"python3 code/script.py"'
|
|
run 'git log -n 1'
|
|
|
|
run 'cd /demo
|
|
datalad clone git@github.com:adswa/myanalysis.git analysis_clone'
|
|
run 'cd analysis_clone'
|
|
run 'datalad rerun 71cb8c5'
|
|
run 'git log pairwise_relationships.png'
|
|
|
|
say 'COMPUTATIONAL REPRODUCIBILITY'
|
|
run 'cd ../myanalysis'
|
|
run 'datalad containers-add software --url shub://adswa/resources:1'
|
|
run 'datalad containers-run -m "rerun analysis in container" \
|
|
--container-name software \
|
|
--input "input/iris.csv" \
|
|
--output "prediction_report.csv" \
|
|
--output "pairwise_relationships.png" \
|
|
"python3 code/script.py"'
|
|
|
|
|
|
run '### Code snippet 195
|
|
datalad create imagenette'
|
|
# Snippet 196: enter the imagenette dataset and fetch the Imagenette archive
# with "datalad download-url --archive".
# The URL is the final argument, so the command ends there: a trailing
# backslash would be a line continuation with nothing to continue and would
# leave the shell waiting for more input.
run '### Code snippet 196
cd imagenette
# 0.12.2 <= datalad < 0.13.4 needs the configuration option -c datalad.runtime.use-patool=1 to handle .tgz
datalad -c datalad.runtime.use-patool=1 download-url \
  --archive \
  --message "Download Imagenette dataset" \
  '"'"'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz'"'"''
|
|
run '### Code snippet 197
|
|
cd ../
|
|
datalad create -c text2git -c yoda ml-project'
|
|
run '### Code snippet 198
|
|
cd ml-project
|
|
mkdir -p data
|
|
# install the dataset into data/
|
|
datalad clone -d . ../imagenette data/raw'
|
|
run '### Code snippet 199
|
|
# show the directory hierarchy
|
|
tree -d'
|
|
run '### Code snippet 200
|
|
datalad containers-add software --url shub://adswa/python-ml:1'
|
|
# Snippet 201: write code/prepare.py through a heredoc.
# The generated script collects the *.JPEG paths whose parent folder is one of
# the two keys in FOLDERS_TO_LABELS, separately for the train and val trees
# under data/raw/imagenette2-160, and writes them with their labels to
# data/train.csv and data/test.csv.
# Python delimits blocks by indentation, so every function, loop and
# conditional body below is indented; without it the written file would not
# parse.
run '### Code snippet 201
cat << EOT > code/prepare.py
#!/usr/bin/env python3

import pandas as pd
from pathlib import Path

FOLDERS_TO_LABELS = {"n03445777": "golf ball",
                     "n03888257": "parachute"}


def get_files_and_labels(source_path):
    images = []
    labels = []
    for image_path in source_path.rglob("*/*.JPEG"):
        filename = image_path
        folder = image_path.parent.name
        if folder in FOLDERS_TO_LABELS:
            images.append(filename)
            label = FOLDERS_TO_LABELS[folder]
            labels.append(label)
    return images, labels


def save_as_csv(filenames, labels, destination):
    data_dictionary = {"filename": filenames, "label": labels}
    data_frame = pd.DataFrame(data_dictionary)
    data_frame.to_csv(destination)


def main(repo_path):
    data_path = repo_path / "data"
    train_path = data_path / "raw/imagenette2-160/train"
    test_path = data_path / "raw/imagenette2-160/val"
    train_files, train_labels = get_files_and_labels(train_path)
    test_files, test_labels = get_files_and_labels(test_path)
    save_as_csv(train_files, train_labels, data_path / "train.csv")
    save_as_csv(test_files, test_labels, data_path / "test.csv")


if __name__ == "__main__":
    repo_path = Path(__file__).parent.parent
    main(repo_path)
EOT'
|
|
run '### Code snippet 202
|
|
datalad status'
|
|
run '### Code snippet 203
|
|
datalad save -m "Add script for data preparation for 2 categories" code/prepare.py'
|
|
run '### Code snippet 204
|
|
datalad containers-run -n software \
|
|
-m "Prepare the data for categories golf balls and parachutes" \
|
|
--input '"'"'data/raw/imagenette2-160/train/n03445777'"'"' \
|
|
--input '"'"'data/raw/imagenette2-160/val/n03445777'"'"' \
|
|
--input '"'"'data/raw/imagenette2-160/train/n03888257'"'"' \
|
|
--input '"'"'data/raw/imagenette2-160/val/n03888257'"'"' \
|
|
--output '"'"'data/train.csv'"'"' \
|
|
--output '"'"'data/test.csv'"'"' \
|
|
"python3 code/prepare.py"'
|
|
# Snippet 205: write code/train.py through a heredoc.
# The generated script reads data/train.csv, loads and resizes the listed
# images to 100x100x3, flattens each to one row of 30000 values, fits an
# SGDClassifier and dumps the fitted model to model.joblib in the repository
# root.
# Python delimits blocks by indentation, so every function body below is
# indented; without it the written file would not parse.
# The literal "SGDClassifier(max_iter=10)" is kept exactly as is because a
# later sed step in this script substitutes that exact text.
run '### Code snippet 205
cat << EOT > code/train.py
#!/usr/bin/env python3

from joblib import dump
from pathlib import Path

import numpy as np
import pandas as pd
from skimage.io import imread_collection
from skimage.transform import resize
from sklearn.linear_model import SGDClassifier


def load_images(data_frame, column_name):
    filelist = data_frame[column_name].to_list()
    image_list = imread_collection(filelist)
    return image_list


def load_labels(data_frame, column_name):
    label_list = data_frame[column_name].to_list()
    return label_list


def preprocess(image):
    resized = resize(image, (100, 100, 3))
    reshaped = resized.reshape((1, 30000))
    return reshaped


def load_data(data_path):
    df = pd.read_csv(data_path)
    labels = load_labels(data_frame=df, column_name="label")
    raw_images = load_images(data_frame=df, column_name="filename")
    processed_images = [preprocess(image) for image in raw_images]
    data = np.concatenate(processed_images, axis=0)
    return data, labels


def main(repo_path):
    train_csv_path = repo_path / "data/train.csv"
    train_data, labels = load_data(train_csv_path)
    sgd = SGDClassifier(max_iter=10)
    trained_model = sgd.fit(train_data, labels)
    dump(trained_model, repo_path / "model.joblib")


if __name__ == "__main__":
    repo_path = Path(__file__).parent.parent
    main(repo_path)
EOT'
|
|
run '### Code snippet 206
|
|
datalad save -m "Add SGD classification script" code/train.py'
|
|
# Snippet 207: write code/evaluate.py through a heredoc.
# The generated script loads data/test.csv via load_data from train.py, loads
# model.joblib, computes the accuracy of the predictions, prints it and writes
# it as JSON to accuracy.json in the repository root.
# The shebang is the first line of the heredoc (a shebang only takes effect on
# line 1), and every function body is indented because Python delimits blocks
# by indentation.
run '### Code snippet 207
cat << EOT > code/evaluate.py
#!/usr/bin/env python3

from joblib import load
import json
from pathlib import Path

from sklearn.metrics import accuracy_score

from train import load_data


def main(repo_path):
    test_csv_path = repo_path / "data/test.csv"
    test_data, labels = load_data(test_csv_path)
    model = load(repo_path / "model.joblib")
    predictions = model.predict(test_data)
    accuracy = accuracy_score(labels, predictions)
    metrics = {"accuracy": accuracy}
    print(metrics)
    accuracy_path = repo_path / "accuracy.json"
    accuracy_path.write_text(json.dumps(metrics))


if __name__ == "__main__":
    repo_path = Path(__file__).parent.parent
    main(repo_path)
EOT'
|
|
run '### Code snippet 208
|
|
datalad save -m "Add script to evaluate model performance" --version-tag "ready4analysis" code/evaluate.py'
|
|
run '### Code snippet 209
|
|
datalad containers-run -n software \
|
|
-m "Train an SGD classifier on the data" \
|
|
--input '"'"'data/raw/imagenette2-160/train/*'"'"' \
|
|
--output '"'"'model.joblib'"'"' \
|
|
"python3 code/train.py"'
|
|
run '### Code snippet 210
|
|
datalad containers-run -n software \
|
|
-m "Evaluate SGD classifier on test data" \
|
|
--input '"'"'data/raw/imagenette2-160/val/*'"'"' \
|
|
--output '"'"'accuracy.json'"'"' \
|
|
"python3 code/evaluate.py"'
|
|
run '### Code snippet 211
|
|
sed -i '"'"'s/SGDClassifier(max_iter=10)/SGDClassifier(max_iter=100)/g'"'"' code/train.py'
|
|
run '### Code snippet 212
|
|
git diff'
|
|
run '### Code snippet 213
|
|
datalad save -m "Increase the amount of iterations to 100" --version-tag "SGD-100" code/train.py'
|
|
run '### Code snippet 214
|
|
datalad rerun -m "Recompute classification with more iterations" ready4analysis..SGD-100'
|
|
# Snippet 215: overwrite code/train.py with a random-forest variant.
# ">|" forces the overwrite even when the shell has noclobber set.
# The generated script is the same pipeline as before (read data/train.csv,
# resize and flatten the images, fit, dump to model.joblib) with
# RandomForestClassifier as the estimator.
# Python delimits blocks by indentation, so every function body below is
# indented; without it the written file would not parse.
run '### Code snippet 215
cat << EOT >| code/train.py
#!/usr/bin/env python3

from joblib import dump
from pathlib import Path

import numpy as np
import pandas as pd
from skimage.io import imread_collection
from skimage.transform import resize
from sklearn.ensemble import RandomForestClassifier

def load_images(data_frame, column_name):
    filelist = data_frame[column_name].to_list()
    image_list = imread_collection(filelist)
    return image_list

def load_labels(data_frame, column_name):
    label_list = data_frame[column_name].to_list()
    return label_list

def preprocess(image):
    resized = resize(image, (100, 100, 3))
    reshaped = resized.reshape((1, 30000))
    return reshaped

def load_data(data_path):
    df = pd.read_csv(data_path)
    labels = load_labels(data_frame=df, column_name="label")
    raw_images = load_images(data_frame=df, column_name="filename")
    processed_images = [preprocess(image) for image in raw_images]
    data = np.concatenate(processed_images, axis=0)
    return data, labels

def main(repo_path):
    train_csv_path = repo_path / "data/train.csv"
    train_data, labels = load_data(train_csv_path)
    rf = RandomForestClassifier()
    trained_model = rf.fit(train_data, labels)
    dump(trained_model, repo_path / "model.joblib")

if __name__ == "__main__":
    repo_path = Path(__file__).parent.parent
    main(repo_path)
EOT'
|
|
run '### Code snippet 216
|
|
datalad save -m "Switch to random forest classification" --version-tag "random-forest" code/train.py'
|
|
run '### Code snippet 217
|
|
datalad rerun --branch="randomforest" -m "Recompute classification with random forest classifier" ready4analysis..SGD-100'
|
|
run '### Code snippet 218
|
|
git diff master -- accuracy.json'
|
|
run '### Code snippet 219
|
|
git checkout master
|
|
cat accuracy.json'
|