963 lines
37 KiB
HTML
963 lines
37 KiB
HTML
<!doctype html>
|
||
<html>
|
||
<head>
|
||
<meta charset="utf-8">
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
|
||
|
||
<!-- Edit me start! -->
|
||
<title>This is where your title goes</title>
|
||
<meta name="description" content=" This is where you put a short description ">
|
||
<meta name="author" content=" Your Name ">
|
||
<!-- Edit me end! -->
|
||
|
||
<link rel="stylesheet" href="../reveal.js/dist/reset.css">
|
||
<link rel="stylesheet" href="../reveal.js/dist/reveal.css">
|
||
<link rel="stylesheet" href="../reveal.js/dist/theme/beige.css">
|
||
|
||
<!-- Theme used for syntax highlighted code -->
|
||
<link rel="stylesheet" href="../reveal.js/plugin/highlight/monokai.css">
|
||
</head>
|
||
<body>
|
||
<div class="reveal">
|
||
<div class="slides">
|
||
|
||
<section>
|
||
<section>
|
||
<h2><small>Brainhack Global 2020 Ankara<br>🧠💻</small><br>An introduction to DataLad<br /><br /></h2>
|
||
|
||
<div style="margin-top:1em;text-align:center">
|
||
<table style="border: none;">
|
||
<tr>
|
||
<td>Adina Wagner
|
||
<br><small>
|
||
<a href="https://twitter.com/AdinaKrik" target="_blank">
|
||
<img data-src="../pics/twitter.png" style="height:30px;margin:0px" />
|
||
@AdinaKrik</a></small></td>
|
||
<td><img style="height:100px;margin-right:10px" data-src="../pics/fzj_logo.svg" />
|
||
<br></td>
|
||
</tr>
|
||
<tr>
|
||
<td>
|
||
<small><a href="http://psychoinformatics.de" target="_blank">Psychoinformatics lab</a>,
|
||
<br> Institute of Neuroscience and
|
||
Medicine, Brain & Behavior (INM-7)<br>
|
||
Research Center Jülich<br>
|
||
<a href="https://repronim.org" target="_blank">ReproNim/INCF fellow</a></small><br>
|
||
|
||
</td>
|
||
<td>
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
</div>
|
||
</section>
|
||
</section>
|
||
|
||
<section>
|
||
|
||
<section data-transition="fade">
|
||
<div><table>
|
||
<tr><dl>
|
||
<img src="../pics/datalad_logo_wide.svg" height="150"><br>
|
||
<b><a href="https://www.datalad.org/" target="_blank"> DataLad</a>
|
||
can help <br> with small or large-scale <br> data management </b>
|
||
<dt></dt>
|
||
</dl></tr>
|
||
<tr><dl class="fragment fade-in">Free, <br> open source, <br> command line tool & Python API </dl></tr>
|
||
</table>
|
||
</div>
|
||
<ul style="vertical-align:middle">
|
||
<br>
|
||
<dt></dt>
|
||
</ul>
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<h2>some <img src="../pics/datalad_logo_wide.svg" alt="DataLad"> Basics</h2>
|
||
|
||
<ul>
|
||
<li>A command-line tool, available for all major operating systems
|
||
(Linux, macOS/OSX, Windows), MIT-licensed</li>
|
||
<li>Built on top of <a href="https://git-scm.com/" target="_blank">Git</a>
|
||
and <a href="https://git-annex.branchable.com/" target="_blank">Git-annex</a></li>
|
||
<dt><li>Allows...</li></dt>
|
||
<dt>... version-controlling arbitrarily large content </dt>
|
||
<dd>version control data and software alongside code!</dd>
|
||
<dt>... transport mechanisms for sharing and obtaining data </dt>
|
||
<dd>consume and collaborate on data (analyses) like software</dd>
|
||
<dt>... (computationally) reproducible data analysis</dt>
|
||
<dd>Track and share provenance of all digital objects</dd>
|
||
<dt>... and <i>much</i> more </dt>
|
||
<li>Completely domain-agnostic</li>
|
||
<br>
|
||
</ul>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>A few things that DataLad can help with</h2>
|
||
<ul class="fragment fade-in">
|
||
<li>
|
||
Getting data
|
||
</li>
|
||
<li>
|
||
Keeping a project clean and orderly
|
||
</li>
|
||
<li>
|
||
Computationally reproducible data analysis
|
||
</li>
|
||
</ul>
|
||
<br><br>
|
||
<div class="fragment fade-in">There is much more, and you can read about it in <br>
|
||
The DataLad Handbook
|
||
(<a href="http://handbook.datalad.org" target="_blank">handbook.datalad.org</a>) <br>
|
||
<img src="../pics/logo.svg" height="250"> </div>
|
||
</section>
|
||
|
||
|
||
|
||
<section>
|
||
<h2>Acknowledgements</h2>
|
||
<table>
|
||
<tr style="vertical-align:middle">
|
||
<td style="vertical-align:middle">
|
||
<dl>
|
||
<dt>Software</dt>
|
||
<dd style="margin-left:5px!important">
|
||
<ul style="margin-left:5px!important">
|
||
<li>Michael Hanke</li>
|
||
<li>Yaroslav Halchenko</li>
|
||
<li>Joey Hess (git-annex)</li>
|
||
<li>Kyle Meyer</li>
|
||
<li>Benjamin Poldrack</li>
|
||
<li><em>26 additional contributors</em></li>
|
||
</ul>
|
||
</dd>
|
||
<dt style="margin-top:20px">Documentation project </dt>
|
||
<dd style="margin-left:5px!important">
|
||
<ul style="margin-left:5px!important">
|
||
<li>Michael Hanke</li>
|
||
<li>Laura Waite</li>
|
||
<li><em>28 additional contributors</em></li>
|
||
</ul>
|
||
</dd>
|
||
</dl>
|
||
</td>
|
||
<td style="vertical-align:middle">
|
||
<div style="margin-bottom:-20px;text-align:center"><strong>Funders</strong></div>
|
||
<img style="height:150px;margin-right:50px" data-src="../pics/nsf.png" />
|
||
<img style="height:150px;margin-right:50px;margin-left:50px" data-src="../pics/binc.png" />
|
||
<img style="height:150px;margin-left:50px" data-src="../pics/bmbf.png" />
|
||
<br />
|
||
<img style="height:80px;margin-top:-40px;margin-left:auto;margin-right:auto;width:100%" data-src="../pics/fzj_logo.svg" />
|
||
<div style="margin-top:-20px">
|
||
<img style="height:60px;margin-right:20px" data-src="../pics/erdf.png" />
|
||
<img style="height:60px;margin-right:20px" data-src="../pics/cbbs_logo.png" />
|
||
<img style="height:60px" data-src="../pics/LSA-Logo.png" />
|
||
</div>
|
||
<div style="margin-top:40px;margin-bottom:20px;text-align:center"><strong>Collaborators</strong></div>
|
||
<div style="margin-top:-20px">
|
||
<img style="height:100px;margin:20px" data-src="../pics/hbp_logo.png" />
|
||
<img style="height:100px;margin:20px" data-src="../pics/conp_logo.png" />
|
||
<img style="height:100px;margin:20px" data-src="../pics/vbc_logo.png" />
|
||
</div>
|
||
<div style="margin-top:-40px">
|
||
<img style="height:120px;margin:20px" data-src="../pics/openneuro_logo.png" />
|
||
<img style="height:120px;margin:20px" data-src="../pics/cbrain_logo.png" />
|
||
<img style="height:140px;margin:20px" data-src="../pics/brainlife_logo.png" />
|
||
</div>
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<h2>Everything happens in DataLad datasets</h2>
|
||
<ul style="font-size:35px">
|
||
<li>DataLad's core data structure</li>
|
||
<ul>
|
||
<li>Dataset = A directory managed by DataLad</li>
|
||
<li>A Git/git-annex repository</li>
|
||
<li>Any directory of your computer can be managed by DataLad.</li>
|
||
<li class="fragment fade-in" data-fragment-index="1">Datasets can be <i>created</i> (from scratch) or <i>installed</i></li>
|
||
</ul>
|
||
</ul>
|
||
|
||
<img class="fragment fade-in" data-fragment-index="1" src="../pics/artwork/src/dataset.svg" width="400">
|
||
<img class="fragment fade-in" data-fragment-index="1" src="../pics/artwork/src/collaboration.svg" width="600">
|
||
<div class="fragment fade-in">
|
||
<small>File viewer and terminal view of a DataLad dataset</small><br>
|
||
<img src="../pics/remodnav-ds-nautilus.png" width="500"> <img src="../pics/remodnav-ds-terminal.png" width="500">
|
||
</div>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Using DataLad</h2>
|
||
|
||
<ul>
|
||
<div>
|
||
<li>DataLad can be used from the command line</li>
|
||
<pre><code>datalad create mydataset</code></pre></div>
|
||
<div>
|
||
<li>... or with its Python API</li>
|
||
<pre><code class="python">import datalad.api as dl
|
||
dl.create(path="mydataset")</code></pre></div>
|
||
<div class="fragment fade-in">
|
||
<li>... and other programming languages can use it via system call</li>
|
||
<pre><code class="python"># in R
|
||
> system("datalad create mydataset")
|
||
</code></pre></div>
|
||
</ul>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
|
||
<section>
|
||
<section data-transition="None">
|
||
<h2>Getting data</h2>
|
||
<ul>
|
||
<li>Datasets can be used to distribute data</li>
|
||
<li>
|
||
You can <code>clone</code> a dataset from a public or private place
|
||
and get access to the data it tracks</li>
|
||
</ul>
|
||
<img height="850" class="fragment fade-in" src="../pics/clonedata.gif" alt="a screenrecording of cloning studyforrest data from github">
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2></h2>
|
||
<ul>
|
||
<li class="fragment fade-in">Datasets are light-weight: Upon installation, only small
|
||
files and meta data about file availability are retrieved, but <b>no file content</b>.</li>
|
||
<img class="fragment fade-in" src="../pics/getdata.gif" height="700">
|
||
</ul>
|
||
<pre class="fragment fade-in"><code>$ datalad clone git@github.com:psychoinformatics-de/studyforrest-data-phase2.git
|
||
install(ok): /tmp/studyforrest-data-phase2 (dataset)
|
||
$ cd studyforrest-data-phase2 && du -sh
|
||
18M . # it's tiny!</code></pre>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Getting data</h2>
|
||
<ul>
|
||
<li>A cloned dataset gets you access to plenty of data, but takes up only little disk space </li>
|
||
<li class="fragment fade-in">Specific file contents can be retrieved on demand via <code>datalad get</code>:</li>
|
||
</ul>
|
||
<pre class="fragment fade-in"><code>$ datalad get sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
|
||
get(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [from mddatasrc...]</code></pre>
|
||
<ul>
|
||
<li class="fragment fade-in">You can also drop file content if you don't need it anymore with <code>datalad drop</code>:</li>
|
||
</ul>
|
||
<pre class="fragment fade-in-then-semi-out"><code>$ datalad drop sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
|
||
drop(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [checking https://arxiv.org/pdf/0904.3664v1.pdf...]</code></pre>
|
||
|
||
<li class="fragment fade-in">Feature: Have access to more data than your computer has disk-space!</li>
|
||
<pre class="fragment fade-in"><code># eNKI dataset (1.5TB, 34k files):
|
||
$ du -sh
|
||
1.5G .
|
||
# HCP dataset (80TB, 15 million files)
|
||
$ du -sh
|
||
48G .
|
||
</code></pre>
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<h2>Getting data</h2>
|
||
<ul>
|
||
<li>You can get more than 200TB of public data with DataLad, for example...</li>
|
||
<ul class="fragment fade-in">
|
||
<li>All OpenNeuro datasets:
|
||
<a href="https://github.com/OpenNeuroDatasets/" target="_blank">
|
||
github.com/OpenNeuroDatasets
|
||
</a><br>
|
||
<pre><code>$ datalad clone https://github.com/OpenNeuroDatasets/ds003171.git</code></pre></li>
|
||
<li>The human connectome project data (full, and in subsets):
|
||
<a href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">
|
||
github.com/datalad-datasets/human-connectome-project-openaccess
|
||
</a><pre><code>$ datalad clone https://github.com/datalad-datasets/human-connectome-project-openaccess.git</code></pre>
|
||
</li>
|
||
<li>
|
||
ABIDE (I-II), INDI, ADHD200, CORR, Healthy Brain Network SSI, and many more in
|
||
<a href="http://datasets.datalad.org/" target="_blank">the DataLad superdataset (datasets.datalad.org)</a>
|
||
<pre><code>$ datalad clone ///</code></pre>
|
||
</li>
|
||
</ul>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
|
||
<section>
|
||
<section>
|
||
<h2>Keeping a project clean and orderly</h2>
|
||
<img src="../pics/frontend_vs_backend_paper.png" style="box-shadow: 10px 10px 8px #888888;height=1000px">
|
||
<imgcredit>adapted from https://dribbble.com/shots/3090048-Front-end-vs-Back-end</imgcredit>
|
||
<br>⬆<br>
|
||
This is a metaphor for most projects after publication
|
||
<aside class="notes">
|
||
mention irreproducibility of unmanaged studies, hence funders require FAIR data management
|
||
mention peer expectations
|
||
</aside>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Keeping a project clean and orderly</h2>
|
||
<img class="fragment fade-in-then-semi-out"
|
||
data-fragment-index="1" height="200"
|
||
src="../pics/masterplan.png">
|
||
<ul style="font-size:35px">
|
||
<li class="fragment fade-in" data-fragment-index="2">
|
||
Much of neuroscientific research is computationally intensive, with
|
||
complex workflows from raw data to result, and plenty of researcher
|
||
degrees of freedom
|
||
</li>
|
||
</ul>
|
||
<img class="fragment fade-in" data-fragment-index="2" src="../pics/dataflow.jpg">
|
||
<imgcredit>
|
||
<a href="https://www.frontiersin.org/articles/10.3389/fninf.2012.00009/full" target="_blank">
|
||
Poline et al., 2011</a></imgcredit>
|
||
</section>
|
||
|
||
|
||
<section data-transition="None">
|
||
<h2>Complex analysis ➝ chaotic projects </h2>
|
||
"Shit, which version of which script produced these outputs from which version
|
||
of what data?"<br>
|
||
<img src="../pics/turingway/manuallabor.png">
|
||
<img src="../pics/turingway/findfiles.png" height="400">
|
||
<img src="../pics/turingway/projectstack.png" height="350">
|
||
<imgcredit>CC-BY Scriberia and <a href="https://the-turing-way.netlify.app/reproducible-research/rdm.html" target="_blank">
|
||
The Turing Way</a>
|
||
</imgcredit>
|
||
</section>
|
||
|
||
|
||
<section data-transition="None">
|
||
<h2>Keeping a project clean and orderly</h2>
|
||
<table>
|
||
<tr>
|
||
<td>
|
||
<img class="fragment fade-in" data-fragment-index="1" src="../pics/turingway/ProjectHistory.png" width="600">
|
||
<imgcredit><a href="https://the-turing-way.netlify.app/reproducible-research/vcs/vcs-data.html" target="_blank">
|
||
CC-BY Scriberia & The Turing Way</a>
|
||
</imgcredit>
|
||
</td>
|
||
<td>
|
||
<ul style="font-size:35px">
|
||
<dt class="fragment fade-in" data-fragment-index="1" >Version control</dt>
|
||
<li class="fragment fade-in" data-fragment-index="2">keep things organized</li>
|
||
<li class="fragment fade-in" data-fragment-index="2">keep track of changes</li>
|
||
<li class="fragment fade-in" data-fragment-index="2">revert changes or go <br>
|
||
back to previous states</li>
|
||
</ul>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td>
|
||
<img class="fragment fade-in" data-fragment-index="3" src="../pics/virtual_dstree_short.svg" width="300">
|
||
</td>
|
||
<td>
|
||
<ul style="font-size:35px">
|
||
<dt class="fragment fade-in" data-fragment-index="3" >Intuitive structure</dt>
|
||
<li class="fragment fade-in" data-fragment-index="4">Follow the
|
||
<a href="http://handbook.datalad.org/en/latest/basics/101-127-yoda.html" target="_blank">YODA principles</a> </li>
|
||
</ul>
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Keeping a project clean and orderly</h2>
|
||
First, let's create a new data analysis dataset with <code>datalad create</code>
|
||
<pre><code>$ datalad create -c yoda myanalysis
|
||
[INFO ] Creating a new annex repo at /tmp/myanalysis
|
||
[INFO ] Scanning for unlocked files (this may take some time)
|
||
[INFO ] Running procedure cfg_yoda
|
||
[INFO ] == Command start (output follows) =====
|
||
[INFO ] == Command exit (modification check follows) =====
|
||
create(ok): /tmp/myanalysis (dataset) </code></pre>
|
||
<li><code>-c yoda</code> applies useful pre-structuring and configurations:</li>
|
||
<pre><code>$ tree
|
||
.
|
||
├── CHANGELOG.md
|
||
├── code
|
||
│ └── README.md
|
||
└── README.md
|
||
</code></pre>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Version Control</h2>
|
||
|
||
<ul>
|
||
<li>DataLad knows two things: Datasets and files</li>
|
||
<img class="fragment fade-in" data-fragment-index="1" style="box-shadow: 5px 5px 3px #888888" src="../pics/artwork/src/dataset.svg" height="330"> <img style="box-shadow: 5px 5px 3px #888888" height="330" class="fragment fade-in" data-fragment-index="2" src="../pics/artwork/src/local_wf.svg">
|
||
</ul><br>
|
||
<li class="fragment fade-in">
|
||
Every file you put into a dataset can be easily version-controlled,
|
||
regardless of size, with the same command: <code>datalad save</code> </li>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Version control</h2>
|
||
<li>Example: Add a new file into a dataset</li>
|
||
<pre><code data-line-numbers="1-3"># create a data analysis script
|
||
$ datalad status
|
||
untracked: code/script.py (file)
|
||
$ git status
|
||
On branch master
|
||
Untracked files:
|
||
(use "git add file..." to include in what will be committed)
|
||
code/script.py
|
||
|
||
nothing added to commit but untracked files present (use "git add" to track)
|
||
</code></pre>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Version control</h2>
|
||
<li>Example: Add a new file into a dataset</li>
|
||
<pre><code data-line-numbers="4-10"># create a data analysis script
|
||
$ datalad status
|
||
untracked: code/script.py (file)
|
||
$ git status
|
||
On branch master
|
||
Untracked files:
|
||
(use "git add file..." to include in what will be committed)
|
||
code/script.py
|
||
|
||
nothing added to commit but untracked files present (use "git add" to track)
|
||
</code></pre>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Version control</h2>
|
||
<li>Example: Add a new file into a dataset</li>
|
||
<pre><code data-line-numbers="0"># create a data analysis script
|
||
$ datalad status
|
||
untracked: code/script.py (file)
|
||
$ git status
|
||
On branch master
|
||
Untracked files:
|
||
(use "git add file..." to include in what will be committed)
|
||
code/script.py
|
||
|
||
nothing added to commit but untracked files present (use "git add" to track)
|
||
</code></pre>
|
||
<li>Save the dataset modification</li>
|
||
<pre><code> $ datalad save -m "Add a k-nearest-neighbour clustering analysis" code/script.py </code></pre>
|
||
</section>
|
||
|
||
<section data-markdown><script type="text/template" >
|
||
|
||
### This means: You can also version control data! <!-- .element: class="fragment" -->
|
||
|
||
<pre><code class="bash" style="max-height:none">$ datalad save \
|
||
-m "Adding raw data from neuroimaging study 1" \
|
||
sub-*
|
||
add(ok): sub-1/anat/T1w.json (file)
|
||
add(ok): sub-1/anat/T1w.nii.gz (file)
|
||
add(ok): sub-1/anat/T2w.json (file)
|
||
add(ok): sub-1/anat/T2w.nii.gz (file)
|
||
add(ok): sub-1/func/sub-1-run-1_bold.json (file)
|
||
add(ok): sub-1/func/sub-1-run-1_bold.nii.gz (file)
|
||
add(ok): sub-10/anat/T1w.json (file)
|
||
add(ok): sub-10/anat/T1w.nii.gz (file)
|
||
add(ok): sub-10/anat/T2w.json (file)
|
||
add(ok): sub-10/anat/T2w.nii.gz (file)
|
||
[110 similar messages have been suppressed]
|
||
save(ok): . (dataset)
|
||
action summary:
|
||
add (ok: 120)
|
||
save (ok: 1)
|
||
</code></pre> <!-- .element: class="fragment" -->
|
||
<strong>Why should you version control more than just your code?</strong><!-- .element: class="fragment" -->
|
||
|
||
Because all building blocks of your analysis evolve<!-- .element: class="fragment" -->
|
||
</script>
|
||
</section>
|
||
|
||
<section>
|
||
<img src="../pics/phd052810s.gif"><br>
|
||
Version controlling data allows you to track data changes and uniquely identify
|
||
precise versions that were used in your analysis
|
||
</section>
|
||
<section>
|
||
<h2>Local version control</h2>
|
||
|
||
<p>Procedurally, version control is easy with DataLad!</p>
|
||
<img class="fragment fade-in" src="../pics/local_wf.svg" height="500"> <!-- .element: class="fragment" -->
|
||
<br>
|
||
|
||
<ul style="font-size:35px"><p class="fragment fade-in">
|
||
Stay flexible:
|
||
<li class="fragment fade-in">Non-complex DataLad core API (easier than Git)</li>
|
||
<li class="fragment fade-in">Pure Git or git-annex commands (for regular Git or git-annex users, or to use specific functionality)</li>
|
||
</ul>
|
||
<b class="fragment fade-in">Advice:</b>
|
||
<ul>
|
||
<li class="fragment fade-in">Save <i>meaningful</i> units of change</li>
|
||
<li class="fragment fade-in">Attach helpful commit messages</li>
|
||
</ul>
|
||
</section>
|
||
|
||
<section data-markdown><script type="text/template" >
|
||
## Version Control
|
||
* Your dataset can be a complete research log, capturing everything that was done, when, by whom, and how
|
||

|
||
* Interact with the history:
|
||
* reset your dataset (or subset of it) to a previous state,
|
||
* throw out changes or bring them back,
|
||
* find out what was done when, how, why, and by whom
|
||
* Identify precise versions: Use data in the most recent version, or the one from 2018, or...
|
||
* ...
|
||
</script>
|
||
</section>
|
||
|
||
<section data-markdown><script type="text/template">
|
||
## From here <span class="fragment" data-fragment-index="1" style="margin-left:350px">to this:</span>
|
||
<!-- .element: height="780" style="box-shadow: 10px 10px 8px #888888" -->
|
||
<!-- .element: class="fragment" data-fragment-index="1" height="780" style="box-shadow: 10px 10px 8px #888888" -->
|
||
<imgcredit>www.phdcomics.com; www.linode.com</imgcredit>
|
||
|
||
<p class="fragment" data-fragment-index="2">BUT: Version control is only one aspect of data management</p>
|
||
|
||
<aside class="notes">
|
||
Note to self
|
||
</aside>
|
||
</script>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Intuitive data analysis structure</h2>
|
||
|
||
<li>You can link datasets together in superdataset-subdataset hierarchies:</li>
|
||
<img src="../pics/artwork/src/linkage_subds.svg" width="900"> <br>
|
||
<pre><code style="max-width:none" data-line-numbers="3">$ cd myanalysis
|
||
# we can install analysis input data as a subdataset to the dataset
|
||
$ datalad clone -d . https://github.com/datalad-handbook/iris_data.git input/
|
||
[INFO ] Scanning for unlocked files (this may take some time)
|
||
[INFO ] Remote origin not usable by git-annex; setting annex-ignore
|
||
install(ok): input (dataset)
|
||
add(ok): input (file)
|
||
add(ok): .gitmodules (file)
|
||
save(ok): . (dataset)
|
||
action summary:
|
||
add (ok: 2)
|
||
install (ok: 1)
|
||
save (ok: 1)
|
||
</code></pre>
|
||
<!-- <ul>
|
||
<li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
|
||
<pre class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
|
||
15530572 annex'd files (77.9 TB recorded total size)
|
||
nothing to save, working tree clean</code></pre>
|
||
<small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
|
||
<li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
|
||
</ul>
|
||
-->
|
||
</section>
|
||
|
||
|
||
<section data-transition="None">
|
||
<h2>Intuitive data analysis structure</h2>
|
||
|
||
<li>You can link datasets together in superdataset-subdataset hierarchies:</li>
|
||
<img src="../pics/artwork/src/linkage_subds.svg" width="900"> <br>
|
||
<pre><code style="max-width:none" >$ tree
|
||
.
|
||
├── CHANGELOG.md
|
||
├── code
|
||
│ ├── README.md
|
||
│ └── script.py
|
||
└── input
|
||
└── iris.csv</code></pre>
|
||
<!-- <ul>
|
||
<li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
|
||
<pre class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
|
||
15530572 annex'd files (77.9 TB recorded total size)
|
||
nothing to save, working tree clean</code></pre>
|
||
<small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
|
||
<li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
|
||
</ul>
|
||
-->
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<h2>Basic organizational principles for datasets</h2>
|
||
<dl>
|
||
<dt>Keep everything clean and modular</dt>
|
||
<li>An analysis is a superdataset, its components are subdatasets, and its structure modular</li>
|
||
<table>
|
||
<tr>
|
||
<td><img src="../pics/dataset_modules.png" height="400"></td>
|
||
<td><pre><code class="bash" style="max-height:none">├── code/
|
||
│ ├── tests/
|
||
│ └── myscript.py
|
||
├── docs
|
||
│ ├── build/
|
||
│ └── source/
|
||
├── envs
|
||
│ └── Singularity
|
||
├── inputs/
|
||
│ └─── data/
|
||
│ ├── dataset1/
|
||
│ │ └── datafile_a
|
||
│ └── dataset2/
|
||
│ └── datafile_a
|
||
├── outputs/
|
||
│ └── important_results/
|
||
│ └── figures/
|
||
└── README.md</code></pre></td>
|
||
</tr>
|
||
</table>
|
||
|
||
</dl>
|
||
<ul>
|
||
<li>do not touch/modify raw data: save any results/computations <i>outside</i> of input datasets</li>
|
||
<li>Keep a superdataset self-contained: Scripts reference subdatasets or files with <i>relative paths</i></li>
|
||
</ul>
|
||
<small>Find out more about organizational principles in
|
||
<a href="http://handbook.datalad.org/en/latest/basics/101-127-yoda.html" target="_blank">the YODA principles</a>!</small>
|
||
</section>
|
||
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<section>
|
||
<h2>Computationally reproducible data analysis</h2>
|
||
<br> This is a metaphor for reproducing (your own) research <br> a few months after publication <br>⬇<br>
|
||
<img src="../pics/frustration.jpg" height="500" style="box-shadow: 10px 10px 8px #888888">
|
||
<small>Write-up: <a href="http://handbook.datalad.org/en/latest/basics/101-130-yodaproject.html">
|
||
handbook.datalad.org/en/latest/basics/101-130-yodaproject.html
|
||
</a> </small>
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<h2>A classification analysis on the iris flower dataset</h2>
|
||
<img src="../pics/iris-machinelearning.png" height="300">
|
||
<img src="../pics/iris_cluster.png" height="450">
|
||
<small>Write-up: <a href="http://handbook.datalad.org/en/latest/basics/101-130-yodaproject.html">
|
||
handbook.datalad.org/en/latest/basics/101-130-yodaproject.html
|
||
</a> </small>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Reproducible execution & provenance capture</h2>
|
||
<p>datalad run</p>
|
||
<img class="fragment fade-in" src="../pics/run_prov.svg" height="600"> <!-- .element: class="fragment" -->
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Computational reproducibility</h2>
|
||
How can I execute the analysis script on my input data in a computationally
|
||
reproducible manner?
|
||
<pre><code data-line-numbers="1-5">$ datalad run -m "analyze iris data with classification analysis" \
|
||
--input "input/iris.csv" \
|
||
--output "prediction_report.csv" \
|
||
--output "pairwise_relationships.png" \
|
||
"python3 code/script.py"
|
||
[INFO ] Making sure inputs are available (this may take some time)
|
||
get(ok): input/iris.csv (file) [from web...]
|
||
[INFO ] == Command start (output follows) =====
|
||
[INFO ] == Command exit (modification check follows) =====
|
||
add(ok): pairwise_relationships.png (file)
|
||
add(ok): prediction_report.csv (file)
|
||
save(ok): . (dataset)
|
||
action summary:
|
||
add (ok: 2)
|
||
get (notneeded: 2, ok: 1)
|
||
save (notneeded: 1, ok: 1)
|
||
</code></pre>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Computational reproducibility</h2>
|
||
How can I execute the analysis script on my input data in a computationally
|
||
reproducible manner?
|
||
<pre><code data-line-numbers="6-15">$ datalad run -m "analyze iris data with classification analysis" \
|
||
--input "input/iris.csv" \
|
||
--output "prediction_report.csv" \
|
||
--output "pairwise_relationships.png" \
|
||
"python3 code/script.py"
|
||
[INFO ] Making sure inputs are available (this may take some time)
|
||
get(ok): input/iris.csv (file) [from web...]
|
||
[INFO ] == Command start (output follows) =====
|
||
[INFO ] == Command exit (modification check follows) =====
|
||
add(ok): pairwise_relationships.png (file)
|
||
add(ok): prediction_report.csv (file)
|
||
save(ok): . (dataset)
|
||
action summary:
|
||
add (ok: 2)
|
||
get (notneeded: 2, ok: 1)
|
||
save (notneeded: 1, ok: 1)
|
||
</code></pre>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Computational reproducibility</h2>
|
||
|
||
<li>A datalad run command produces a machine-readable record, identifiable
|
||
via commit hash</li>
|
||
<pre><code style="max-height:none">$ git log
|
||
commit df2dae9b5af184a0c463708acf8356b877c511a8 (HEAD -> master)
|
||
Author: Adina Wagner adina.wagner@t-online.de
|
||
Date: Tue Dec 1 11:58:18 2020 +0100
|
||
|
||
[DATALAD RUNCMD] analyze iris data with classification analysis
|
||
|
||
=== Do not change lines below ===
|
||
{
|
||
"chain": [],
|
||
"cmd": "python3 code/script.py",
|
||
"dsid": "9ffdbfcd-f4af-429a-b64a-0c81b48b7f62",
|
||
"exit": 0,
|
||
"extra_inputs": [],
|
||
"inputs": [
|
||
"input/iris.csv"
|
||
],
|
||
"outputs": [
|
||
"prediction_report.csv",
|
||
"pairwise_relationships.png"
|
||
],
|
||
"pwd": "."
|
||
}
|
||
^^^ Do not change lines above ^^^
|
||
</code></pre>
|
||
</section>
|
||
|
||
<section data-transition="None">
|
||
<h2>Computational reproducibility</h2>
|
||
|
||
<li>A datalad run command produces a machine-readable record, identifiable
|
||
via commit hash</li>
|
||
<pre><code style="max-height:none">$ git log
|
||
commit df2dae9b5af184a0c463708acf8356b877c511a8 (HEAD -> master)
|
||
Author: Adina Wagner adina.wagner@t-online.de
|
||
Date: Tue Dec 1 11:58:18 2020 +0100
|
||
|
||
[DATALAD RUNCMD] analyze iris data with classification analysis
|
||
|
||
[...]
|
||
</code></pre>
|
||
<li>You can <code>rerun</code> this hash to repeat the
|
||
analysis:
|
||
<pre><code> $ datalad rerun df2dae9b5af1
|
||
datalad rerun df2dae9b5af18
|
||
[INFO ] run commit df2dae9; (analyze iris data...)
|
||
[INFO ] Making sure inputs are available (this may take some time)
|
||
unlock(ok): pairwise_relationships.png (file)
|
||
unlock(ok): prediction_report.csv (file)
|
||
[INFO ] == Command start (output follows) =====
|
||
[INFO ] == Command exit (modification check follows) =====
|
||
add(ok): pairwise_relationships.png (file)
|
||
add(ok): prediction_report.csv (file)
|
||
action summary:
|
||
add (ok: 2)
|
||
get (notneeded: 3)
|
||
save (notneeded: 2)
|
||
unlock (ok: 2)
|
||
</code></pre></li>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Computational reproducibility</h2>
|
||
<ul>
|
||
<li>Code may fail (to reproduce) if run with different software</li>
|
||
<li>Datasets can store (and share) software environments (Docker or Singularity containers)
|
||
and reproducibly execute code inside of the software container, capturing software as additional
|
||
provenance</li>
|
||
<li>DataLad extension: <code>datalad-container</code></li>
|
||
</ul>
|
||
|
||
<p>datalad containers-run</p>
|
||
<img class="fragment fade-in" src="../pics/containers-run.svg" height="600"> <!-- .element: class="fragment" -->
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Computational reproducibility</h2>
|
||
<li>You can add (any amount of) software containers to your dataset to link a
|
||
software environment to your analysis</li>
|
||
<pre><code>$ datalad containers-add software --url shub://adswa/resources:2
|
||
[INFO ] Initiating special remote datalad
|
||
add(ok): .datalad/config (file)
|
||
save(ok): . (dataset)
|
||
containers_add(ok): /tmp/myanalysis/.datalad/environments/software/image (file)
|
||
action summary:
|
||
add (ok: 1)
|
||
containers_add (ok: 1)
|
||
save (ok: 1)
|
||
</code></pre>
|
||
<small>Write-up: <a href="http://handbook.datalad.org/en/latest/basics/101-133-containersrun.html">
|
||
http://handbook.datalad.org/en/latest/basics/101-133-containersrun.html
|
||
</a> </small>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Computational reproducibility</h2>
|
||
<li><code>datalad containers-run</code> will execute the command in the specified
|
||
software environment</li>
|
||
<pre><code>$ datalad containers-run -m "rerun analysis in container" \
|
||
--container-name software \
|
||
--input "input/iris.csv" \
|
||
--output "prediction_report.csv" \
|
||
--output "pairwise_relationships.png" \
|
||
"python3 code/script.py"
|
||
[INFO] Making sure inputs are available (this may take some time)
|
||
[INFO] == Command start (output follows) =====
|
||
[INFO] == Command exit (modification check follows) =====
|
||
unlock(ok): pairwise_relationships.png (file)
|
||
unlock(ok): prediction_report.csv (file)
|
||
add(ok): pairwise_relationships.png (file)
|
||
add(ok): prediction_report.csv (file)
|
||
save(ok): . (dataset)
|
||
action summary:
|
||
add (ok: 2)
|
||
get (notneeded: 4)
|
||
save (notneeded: 1, ok: 1)
|
||
unlock (ok: 2)</code></pre>
|
||
<ul><li>... And a <code>datalad rerun</code> will repeat the analysis in the
|
||
specified software environment</li></ul>
|
||
</section>
|
||
</section>
|
||
|
||
<section>
|
||
<section>
|
||
<h2>A quick summary of this sneak peek</h2>
|
||
<ul>
|
||
<li>Getting data</li>
|
||
<ul class="fragment fade-in" style="font-size:30px">
|
||
<li>You can retrieve DataLad datasets with "datalad clone url/path"</li>
|
||
<li>A dataset allows you to retrieve data on demand via "datalad get"</li>
|
||
<li>You can drop unused data to free up disk space with "datalad drop"</li>
|
||
</ul>
|
||
<li class="fragment fade-in" >Keeping projects clean</li>
|
||
<ul class="fragment fade-in" style="font-size:30px">
|
||
<li>Create a dataset for data analysis using "datalad create -c yoda mydatasetname"</li>
|
||
<li>In this dataset, DataLad can version control data of any size with "datalad save"</li>
|
||
<li>You can link individual datasets as reusable and intuitive modular components,
|
||
for example your input data to your analysis, with "datalad clone -d . url"</li>
|
||
</ul>
|
||
<li class="fragment fade-in" >Computational reproducibility</li>
|
||
<ul class="fragment fade-in" style="font-size:30px">
|
||
<li>"datalad run" can create a digital, machine-readable, and re-executable record of how you
|
||
did your data analysis</li>
|
||
<li>You or others can redo the analysis automatically with "datalad rerun"</li>
|
||
<li>You can even link software environments to your analysis with the "datalad-container"
|
||
extension, and run analysis with "datalad containers-run"</li>
|
||
</ul>
|
||
</ul>
|
||
</section>
|
||
|
||
<section>
|
||
<h2>Is there more?</h2>
|
||
<ul>
|
||
<li style="list-style-type:none">Yes, a lot!</li>
|
||
<li class="fragment fade-in">For example: <a href="http://handbook.datalad.org/en/latest/usecases/collaborative_data_management.html" target="_blank">
|
||
Collaborative data analysis workflows</a> </li>
|
||
<li class="fragment fade-in"><a href="http://handbook.datalad.org/en/latest/basics/basics-thirdparty.html" target="_blank">
|
||
Publishing data</a> </li>
|
||
<li class="fragment fade-in"><a href="http://handbook.datalad.org/en/latest/usecases/reproducible-paper.html" target="_blank">
|
||
Writing reproducible papers</a> </li>
|
||
<li class="fragment fade-in"><a href="http://handbook.datalad.org/en/latest/usecases/ml-analysis.html" target="_blank">
|
||
Computationally reproducible machine learning pipelines</a> </li>
|
||
<li class="fragment fade-in">...</li>
|
||
</ul>
|
||
</section>
|
||
|
||
|
||
<section>
|
||
<h2>Resources and Further Reading</h2>
|
||
<table>
|
||
<tr>
|
||
<td>
|
||
Comprehensive user documentation in the<br>
|
||
DataLad Handbook
|
||
<a href="http://handbook.datalad.org" target="_blank">(handbook.datalad.org)</a>
|
||
</td>
|
||
<td>
|
||
<img src="../pics/logo.svg" height="150">
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
|
||
<table>
|
||
<tr>
|
||
<td><img src="../pics/artwork/src/enter.svg" height="100"></td>
|
||
<td>
|
||
<ul>
|
||
<li>High-level function/command overviews, <br>
|
||
Installation, Configuration, Cheatsheet</li>
|
||
</ul>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td><img src="../pics/artwork/src/basics.svg" height="100"></td>
|
||
<td>
|
||
<ul>
|
||
<li>Narrative-based code-along course</li>
|
||
<li>Independent of background/skill level, <br>
|
||
suitable for data management novices</li>
|
||
</ul>
|
||
</td>
|
||
</tr>
|
||
<tr>
|
||
<td><img src="../pics/artwork/src/usecases.svg" height="100"></td>
|
||
<td>
|
||
<ul>
|
||
<li>Step-by-step solutions to common <br>
|
||
data management problems, like<br />how to
|
||
make a reproducible paper</li>
|
||
</ul>
|
||
</td>
|
||
</tr>
|
||
</table>
|
||
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
|
||
<script src="../reveal.js/dist/reveal.js"></script>
|
||
<script src="../reveal.js/plugin/notes/notes.js"></script>
|
||
<script src="../reveal.js/plugin/markdown/markdown.js"></script>
|
||
<script src="../reveal.js/plugin/highlight/highlight.js"></script>
|
||
<script>
|
||
// More info about initialization & config:
|
||
// - https://revealjs.com/initialization/
|
||
// - https://revealjs.com/config/
|
||
// Start the reveal.js deck with the settings below. The Markdown, Highlight and
// Notes plugins registered at the end are the ones loaded by the <script> tags above.
Reveal.initialize({
|
||
// NOTE(review): presumably keeps the URL hash in sync with the current slide —
// confirm against the config docs linked above.
hash: true,
|
||
// The "normal" size of the presentation, aspect ratio will be preserved
|
||
// when the presentation is scaled to fit different resolutions. Can be
|
||
// specified using percentage units.
|
||
width: 1280,
|
||
height: 960,
|
||
// Factor of the display size that should remain empty around the content
|
||
margin: 0.3,
|
||
// Bounds for smallest/largest possible scale to apply to content
|
||
minScale: 0.2,
|
||
maxScale: 1.0,
|
||
|
||
// Remaining options (controls, progress, history, center, slideNumber, pdf*
// settings, transition): exact semantics are in the reveal.js config docs linked above.
// NOTE(review): both `hash` and `history` are enabled — check in the docs whether
// one already implies the other.
controls: true,
|
||
progress: true,
|
||
history: true,
|
||
center: true,
|
||
slideNumber: 'c',
|
||
pdfSeparateFragments: false,
|
||
pdfMaxPagesPerSlide: 1,
|
||
pdfPageHeightOffset: -1,
|
||
transition: 'slide', // none/fade/slide/convex/concave/zoom
|
||
// Learn about plugins: https://revealjs.com/plugins/
|
||
plugins: [ RevealMarkdown, RevealHighlight, RevealNotes ]
|
||
});
|
||
</script>
|
||
</body>
|
||
</html>
|