785 lines
31 KiB
HTML
785 lines
31 KiB
HTML
<!doctype html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
|
|
|
|
<!-- Edit me start! -->
|
|
<title>DataLad: Decentralized Management of Digital Objects for Open Science</title>
|
|
<meta name="description" content="Talk slides: Decentralized Management of Digital Objects for Open Science with DataLad">
|
|
<meta name="author" content="Adina Wagner">
|
|
<!-- Edit me end! -->
|
|
|
|
<link rel="stylesheet" href="../reveal.js/dist/reset.css">
|
|
<link rel="stylesheet" href="../reveal.js/dist/reveal.css">
|
|
<link rel="stylesheet" href="../reveal.js/dist/theme/beige.css">
|
|
|
|
<!-- Theme used for syntax highlighted code -->
|
|
<link rel="stylesheet" href="../reveal.js/plugin/highlight/monokai.css">
|
|
</head>
|
|
<body>
|
|
<div class="reveal">
|
|
<div class="slides">
|
|
|
|
<section>
|
|
<section>
|
|
<table style="border:none">
|
|
<tr>
|
|
<td><img style="height:150px;margin-bottom:30px" data-src="../pics/datalad_logo_wide.svg">
|
|
</td>
|
|
<td>
|
|
<h3>
|
|
Decentralized Management <br>of Digital Objects for Open Science</h3>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
|
|
<div style="margin-top:1em;text-align:center">
|
|
<table style="border: none;">
|
|
<tr>
|
|
<td>Adina Wagner
|
|
<br><small>
|
|
<a href="https://twitter.com/AdinaKrik" target="_blank">
|
|
<img data-src="../pics/twitter.png" style="height:30px;margin:0px" />
|
|
@AdinaKrik</a></small></td>
|
|
</tr>
|
|
<tr>
|
|
<td>
|
|
<small><a href="http://psychoinformatics.de" target="_blank">Psychoinformatics lab</a>,
|
|
<br> Institute of Neuroscience and
|
|
Medicine, Brain & Behavior (INM-7)<br>
|
|
Research Center Jülich</small><br>
|
|
</td>
|
|
<td></td>
|
|
<td><img style="height:70px;margin-right:10px" data-src="../pics/fzj_logo.svg" />
|
|
<br></td>
|
|
</tr>
|
|
</table>
|
|
</div>
|
|
<br><br>
|
|
<div>
|
|
<table>
|
|
<tr>
|
|
<td><img style="width:280px;margin-bottom:0px" src="../pics/dc2020.svg"></td>
|
|
<td><span class="rainbow">Debian in Arts & Science</span></td>
|
|
</tr> </table>
|
|
</div>
|
|
<small>
|
|
Slides: <a href="https://github.com/datalad-handbook/course/blob/master/talks/PDFs/DebConf.pdf" target="_blank">
|
|
https://github.com/datalad-handbook/course/</a></small>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Acknowledgements</h2>
|
|
<table>
|
|
<tr style="vertical-align:middle">
|
|
<td style="vertical-align:middle">
|
|
<dl>
|
|
<dt>Software</dt>
|
|
<dd style="margin-left:5px!important">
|
|
<ul style="margin-left:5px!important">
|
|
<li>Michael Hanke</li>
|
|
<li>Yaroslav Halchenko</li>
|
|
<li>Joey Hess (git-annex)</li>
|
|
<li>Kyle Meyer</li>
|
|
<li>Benjamin Poldrack</li>
|
|
<li><em>26 additional contributors</em></li>
|
|
</ul>
|
|
</dd>
|
|
<dt style="margin-top:20px">Documentation project </dt>
|
|
<dd style="margin-left:5px!important">
|
|
<ul style="margin-left:5px!important">
|
|
<li>Michael Hanke</li>
|
|
<li>Laura Waite</li>
|
|
<li><em>28 additional contributors</em></li>
|
|
</ul>
|
|
</dd>
|
|
</dl>
|
|
</td>
|
|
<td style="vertical-align:middle">
|
|
<div style="margin-bottom:-20px;text-align:center"><strong>Funders</strong></div>
|
|
<img style="height:150px;margin-right:50px" data-src="../pics/nsf_2020.png" />
|
|
<img style="height:150px;margin-right:50px;margin-left:50px" data-src="../pics/binc.png" />
|
|
<img style="height:150px;margin-left:50px" data-src="../pics/bmbf_2020.png" />
|
|
<br />
|
|
<img style="height:80px;margin-top:-40px;margin-left:auto;margin-right:auto;width:100%" data-src="../pics/fzj_logo.svg" />
|
|
<div style="margin-top:-20px">
|
|
<img style="height:60px;margin-right:20px" data-src="../pics/erdf.png" />
|
|
<img style="height:60px;margin-right:20px" data-src="../pics/cbbs_logo.png" />
|
|
<img style="height:60px" data-src="../pics/LSA-Logo.png" />
|
|
</div>
|
|
<div style="margin-top:40px;margin-bottom:20px;text-align:center"><strong>Collaborators</strong></div>
|
|
<div style="margin-top:-20px">
|
|
<img style="height:100px;margin:20px" data-src="../pics/hbp_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/conp_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/vbc_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-40px">
|
|
<img style="height:120px;margin:20px" data-src="../pics/openneuro_logo.png" />
|
|
<img style="height:120px;margin:20px" data-src="../pics/cbrain_logo.png" />
|
|
<img style="height:140px;margin:20px" data-src="../pics/brainlife_logo.png" />
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</section>
|
|
</section>
|
|
|
|
<!-- INTRODUCTION -->
|
|
|
|
<section>
|
|
<section data-transition="none">
|
|
<h3>Perks of being a neuroscientist...</h3>
|
|
<p class="fragment fade-in" data-fragment-index="1">A growing culture of open data
|
|
<img src="../pics/opendata.gif"> <imgcredit>https://en.wikipedia.org/wiki/List_of_neuroscience_databases</imgcredit></p>
|
|
</section>
|
|
|
|
<!--
|
|
<section data-transition="none">
|
|
<h3>Perks of being a (neuro)scientist...</h3>
|
|
<p>... with many open dataset of several TB of data!
|
|
<img src="../pics/bigneurodata.png">
|
|
<img src="../pics/bigneurodata2.png"> </p>
|
|
</section>
|
|
-->
|
|
|
|
<section data-transition="none">
|
|
<h3>Perks of being a neuroscientist...</h3>
|
|
<p> A large and growing amount of open source software
|
|
<table>
|
|
<tr style="vertical-align:middle">
|
|
<td style="vertical-align:middle">
|
|
<img style="height:100px;margin-right:50px" data-src="../pics/jupyter_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/mindboggle_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/psychopy_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/fsl_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/pydra_logo.png" />
|
|
<img style="height:100px;margin-right:50px" data-src="../pics/neurodebian.png" />
|
|
<br />
|
|
|
|
<div style="margin-top:-20px">
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/R_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/rstudio-logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/afni_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/mne-logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/sklearn_logo.png" />
|
|
<img style="height:100px;margin-left:50px" data-src="../pics/FSlogo.png" />
|
|
<img style="height:80px;margin-left:50px" data-src="../pics/pysurfer_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/dipy-logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/nilearn-logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/mrtrix-logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/giraffetools_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/mricron-logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/pandas_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-20px">
|
|
|
|
<img style="height:100px;margin:20px" data-src="../pics/ants_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/brainiak_logo.svg" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/spm.svg" />
|
|
<img style="height:80px;margin:20px" data-src="../pics/cpac_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/fieldtrip_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-40px">
|
|
<br><small>... and many more!</small>
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</p>
|
|
</section>
|
|
|
|
|
|
<section data-transition="none">
|
|
<h3>Perks of being a neuroscientist...</h3>
|
|
<p>
|
|
Many readily available, often free, sometimes FOSS, services for data storage and collaboration<br>
|
|
<img src="../pics/services_only.png" height="750"> <imgcredit>https://www.incf.org/resources/sbps</imgcredit>
|
|
</p>
|
|
</section>
|
|
|
|
<!--
|
|
<section data-transition="none">
|
|
<h3>Perks of being a (neuro)scientist...</h3>
|
|
<p>Coordinated efforts towards common standards
|
|
<img src="../pics/incf_standards.png"> <imgcredit>https://www.incf.org/resources/sbps</imgcredit>
|
|
</p>
|
|
</section>
|
|
-->
|
|
|
|
<section data-transition="none">
|
|
<h3>Perks of being a neuroscientist...</h3>
|
|
Work on fascinating questions with fascinating data<br>
|
|
<table>
|
|
<tr>
|
|
<td style="border:0px">
|
|
<img height="300px" src="../pics/brainscan.gif"> <imgcredit><tiny>Dwayne Reed; commons.wikimedia.org<br>/wiki/File:Parasagittal_MRI_of_human_head_in<br>_patient_with_benign_familial_macrocephaly_prior<br>_to_brain_injury_(ANIMATED).gif</tiny></imgcredit>
|
|
</td>
|
|
<td style="border:0px">
|
|
<img height="300px" src="../pics/meg-data2.png"><imgcredit>https://mne.tools</imgcredit>
|
|
</td>
|
|
</tr>
|
|
<tr style="border:0px">
|
|
<td style="border:0px">
|
|
<img height="200px" src="../pics/meg-data.png"><imgcredit>https://mne.tools</imgcredit>
|
|
</td>
|
|
<td style="border:0px">
|
|
<img height="300px" src="../pics/tractography.gif"><imgcredit>http://trackvis.org</imgcredit>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>The good news, the bad news</h3>
|
|
<table>
|
|
<tr style="vertical-align:middle; font-size:35px" ></tr>
|
|
<tr>
|
|
<td style="border:0px" class="fragment fade-in-then-semi-out" data-fragment-index="1">
|
|
Data sharing</td>
|
|
<td style="border:0px" class="fragment fade-in" data-fragment-index="2">
|
|
Heterogeneous distribution and updating, many scientists lack data management skills</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="border:0px" class="fragment fade-in-then-semi-out" data-fragment-index="3">
|
|
Patient data</td>
|
|
<td style="border:0px" class="fragment fade-in" data-fragment-index="4">
|
|
... that's sparse & subject to strict data protection, making sharing difficult</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="border:0px" class="fragment fade-in-then-semi-out" data-fragment-index="5">
|
|
Data analysis</td>
|
|
<td style="border:0px" class="fragment fade-in" data-fragment-index="6">
|
|
Reproducibility is threatened by non-transparent, multi-step analyses¹ & unstable results across software versions²</td>
|
|
</tr>
|
|
<tr>
|
|
<td style="border:0px" class="fragment fade-in-then-semi-out" data-fragment-index="7">Collaboration</td>
|
|
<td style="border:0px" class="fragment fade-in" data-fragment-index="8">Few interoperable workflows across institutes, rather: isolated solutions</td>
|
|
</tr>
|
|
<!--
|
|
<tr>
|
|
<td class="fragment fade-in-then-semi-out" data-fragment-index="7">Standards towards consistent file organization, description, and interoperability </td>
|
|
<td class="fragment fade-in" data-fragment-index="8">Standards aren't static and evolve over time, thus tools and data need updates</td>
|
|
</tr>
|
|
-->
|
|
</table>
|
|
<br>
|
|
<div class="fragment fade-in">General underlying difficulties (distribution, updates, management, interoperability, reproducibility) also exist(ed) for software - <br> what can we learn?</div>
|
|
<small>
|
|
<div class="fragment fade-in-then-semi-out" data-fragment-index="6" align="left">
|
|
¹ <a href="https://doi.org/10.1038/s41586-020-2314-9" target="_blank">Botvinik-Nezer et al., 2020: Variability in the analysis of a single neuroimaging dataset by many teams </a><br>
|
|
² <a href="https://doi.org/10.1177%2F1094342020926237" target="_blank"> Kiar et al., 2020: Comparing perturbation models for evaluating stability of neuroimaging pipelines </a>
|
|
</div>
|
|
</small>
|
|
</section>
|
|
|
|
<section>
|
|
<h3>improve scientific workflows, coming from the perspective of software distributions</h3>
|
|
<img class="fragment fade-in" src="../pics/datalad_logo_wide.svg">
|
|
<div class="fragment fade-in">"Share and treat data like software"</div>
|
|
<img class="fragment fade-in" src="../pics/yarik_michael.jpg">
|
|
<img class="fragment fade-in" src="../pics/neurodebian.png" height="250">
|
|
<img class="fragment fade-in" src="../pics/joey-hess.jpg" height="250">
|
|
<img class="fragment fade-in" src="../pics/gitannex.png" height="250">
|
|
</section>
|
|
|
|
<!--
|
|
<section data-transition="None">
|
|
<h2>Rethinking data</h2>
|
|
<ul class="fragment fade-in">
|
|
<li>Just like code, <b>data is not static</b>, and should be versioned</li>
|
|
<li>Just like code, <b>data is subject to collaboration</b>,
|
|
and workflows as in open source development should be possible</li>
|
|
<li>Provenance of data is essential for reproducibility </li>
|
|
</ul>
|
|
<img class="fragment fade-in" src="../pics/phd052810s.gif">
|
|
|
|
</section>
|
|
<section data-transition="None">
|
|
<h2>Rethinking data</h2>
|
|
<ul>
|
|
<li>Just like code, <b>data is subject to collaboration</b>. It should be (publicly)
|
|
sharable, and also update-able, with generic workflows similar to those in
|
|
open-source software development</li>
|
|
</ul>
|
|
<img height="450" class="fragment fade-in" src="../pics/collaboration13.svg">
|
|
|
|
</section>
|
|
-->
|
|
</section>
|
|
</section>
|
|
|
|
<!-- General datalad functions -->
|
|
<section>
|
|
<section data-markdown><script type="text/template">
|
|
## DataLad
|
|
|
|
- DataLad: Joint management of digital objects through their entire life cycle
|
|
|
|
- Version control
|
|
- Transport logistics
|
|
- Interoperability
|
|
- Provenance capture
|
|
|
|
- Basics
|
|
|
|
- Built on top of Git and [git-annex](https://git-annex.branchable.com/) (Joey Hess)
|
|
- Free and open source Python software (MIT license)
|
|
- Python API and command line interface
|
|
|
|
</script>
|
|
<aside class="notes">
|
|
- modular management
|
|
- data provenance
|
|
- discoverability
|
|
</aside>
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Version Control</h2>
|
|
|
|
<ul>
|
|
<li>DataLad knows two things: Datasets and files</li>
|
|
<img class="fragment fade-in" data-fragment-index="1" style="box-shadow: 5px 5px 3px #888888" src="../pics/artwork/src/dataset.svg" height="330"> <img style="box-shadow: 5px 5px 3px #888888" height="330" class="fragment fade-in" data-fragment-index="2" src="../pics/artwork/src/local_wf.svg">
|
|
<li class="fragment fade-in" data-fragment-index="3">A DataLad dataset is a Git repository:</li>
|
|
<ul class="fragment fade-in" data-fragment-index="3">
|
|
<li>Content and domain agnostic</li>
|
|
<li>Minimization of custom procedures or data structures (<b>user must not lose data or data access if DataLad vanishes</b>)</li>
|
|
<li><b>Uncompromised decentralization</b></li>
|
|
</ul>
|
|
|
|
<!--<img class="fragment fade-in" style="box-shadow: 5px 5px 3px #888888" height="330" src="../pics/artwork/src/collaboration.svg">-->
|
|
</ul>
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Version Control: Data</h2>
|
|
|
|
<ul>
|
|
<li class="fragment fade-in-then-semi-out">Datasets have an optional annex for (large or sensitive) data (or text/code). </li>
|
|
<li class="fragment fade-in-then-semi-out">Identity (hash) and location information is put
|
|
into Git, rather than file content. The annex, and transport to and from
|
|
it is managed with <b>git-annex</b>
|
|
(<a href="https://git-annex.branchable.com" target="_blank">git-annex.branchable.com</a>) <br>
|
|
→ decentralized version control for files of any size.</li>
|
|
<li class="fragment fade-in-then-semi-out">DataLad works towards wrapping Git and git-annex into a non-complex core-API
|
|
(helpful for data management novices).</li>
|
|
</ul>
|
|
<img height="330" class="fragment fade-in" data-fragment-index="1" src="../pics/artwork/src/local_wf.svg">
|
|
<ul>
|
|
<li class="fragment fade-in">Flexibility and commands of Git and git-annex are preserved (useful for experienced Git/git-annex users).</li>
|
|
</ul>
|
|
</section>
|
|
|
|
|
|
<section data-transition="none">
|
|
<h2>Version Control: Nesting</h2>
|
|
|
|
<ul>
|
|
<li>Seamless nesting mechanisms:
|
|
<img height="330" src="../pics/artwork/src/linkage_subds.svg">
|
|
<ul>
|
|
<li>hierarchies of datasets in super-/sub-dataset relationships</li>
|
|
<li>based on Git submodules, but more seamless</li>
|
|
</ul>
|
|
<li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
|
|
<pre class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
|
|
15530572 annex'd files (77.9 TB recorded total size)
|
|
nothing to save, working tree clean</code></pre>
|
|
<small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
|
|
<li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
|
|
</ul>
|
|
|
|
|
|
|
|
<aside class="notes">
|
|
Two advantages:
|
|
<ul>
|
|
<li>Scalable, size-independent version control</li>
|
|
<li>Modularization of research components to increase transparency
|
|
and aid component reuse, as individual components can be flexibly
|
|
puzzled together into new research objects, while being uniquely identified and versioned</li>
|
|
</ul>
|
|
|
|
At this point: Fixed data management, laid a foundation for updating data
|
|
</aside>
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Transport logistics</h2>
|
|
<ul>
|
|
<li>Share data like source code</li>
|
|
<li class="fragment fade-in-then-semi-out" data-fragment-index="1">Datasets can be cloned, pushed, and updated from and to local paths,
|
|
remote hosting services, external special remotes</li>
|
|
</ul>
|
|
<img class="fragment fade-in" data-fragment-index="1" style="box-shadow: 5px 5px 3px #888888" height="330" src="../pics/artwork/src/collaboration.svg">
|
|
<ul>
|
|
<li class="fragment fade-in" data-fragment-index="2">Flexible data access management for annexed file contents based on storage location</li>
|
|
</ul>
|
|
<aside class="notes">
|
|
Idea behind datalad: Enable a similar level of tooling and culture for the distribution and version control of data as it is present for open source software development
|
|
</aside>
|
|
</section>
|
|
|
|
|
|
|
|
<section data-transition="none">
|
|
<h2>Transport logistics</h2>
|
|
<ul>
|
|
<li class="fragment fade-in-then-semi-out">Disk-space aware workflows: Cloned datasets are lean (only Git):</li>
|
|
<pre class="fragment fade-in"><code>$ datalad clone git@github.com:datalad-datasets/machinelearning-books.git
|
|
install(ok): /tmp/machinelearning-books (dataset)
|
|
$ cd machinelearning-books && du -sh
|
|
348K .</code></pre>
|
|
<pre class="fragment fade-in"><code>$ ls
|
|
A.Shashua-Introduction_to_Machine_Learning.pdf
|
|
B.Efron_T.Hastie-Computer_Age_Statistical_Inference.pdf
|
|
C.E.Rasmussen_C.K.I.Williams-Gaussian_Processes_for_Machine_Learning.pdf
|
|
D.Barber-Bayesian_Reasoning_and_Machine_Learning.pdf
|
|
[...]</code></pre>
|
|
<li class="fragment fade-in-then-semi-out"> annexed file's contents can
|
|
be retrieved & dropped on demand:</li>
|
|
</ul>
|
|
<pre class="fragment fade-in"><code>$ datalad get A.Shashua-Introduction_to_Machine_Learning.pdf
|
|
get(ok): /tmp/machinelearning-books/A.Shashua-Introduction_to_Machine_Learning.pdf (file) [from web...]</code></pre>
|
|
<pre class="fragment fade-in-then-semi-out"><code>$ datalad drop A.Shashua-Introduction_to_Machine_Learning.pdf
|
|
drop(ok): /tmp/machinelearning-books/A.Shashua-Introduction_to_Machine_Learning.pdf (file) [checking https://arxiv.org/pdf/0904.3664v1.pdf...]</code></pre>
|
|
|
|
<aside class="notes">
|
|
Idea behind datalad: Enable a similar level of tooling and culture for the distribution and version control of data as it is present for open source software development
|
|
</aside>
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Interoperability</h2>
|
|
<ul>
|
|
<li>DataLad is built to maximize interoperability and use with hosting and
|
|
storage technology</li>
|
|
</ul>
|
|
<img class="fragment fade-in" src="../pics/services_only.png" height="650">
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Interoperability</h2>
|
|
<ul>
|
|
<li>DataLad is built to maximize interoperability and use with hosting and
|
|
storage technology</li>
|
|
</ul>
|
|
<img src="../pics/services_connected.png" height="650">
|
|
</section>
|
|
<!--
|
|
<section data-transition="None">
|
|
<h2>Interoperability</h2>
|
|
<ul>
|
|
<li>DataLad is built to maximize interoperability and use with hosting and
|
|
storage technology</li>
|
|
</ul>
|
|
<a href="https://github.com/psychoinformatics-de/paper-remodnav/" target="blank"> <img src="../pics/remodnavpaper.png">
|
|
</a>
|
|
</section>
|
|
-->
|
|
|
|
|
|
<section data-transition="none">
|
|
<h2>Provenance capture</h2>
|
|
<ul>
|
|
<li>Datasets can capture dataset <b>transformations</b> and their <b>cause</b> in order
|
|
to track the entire evolution and lineage of files in datasets</li>
|
|
</ul>
|
|
<img src="../pics/w3cprov.png" width="700">
|
|
<ul>
|
|
<li>"How did this file come to be?",
|
|
"What steps were undertaken to transform the raw data into the published result?",
|
|
"Can you recompute this for me?"
|
|
</li>
|
|
</ul>
|
|
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Provenance capture</h2>
|
|
<ul>
|
|
<li><b>Basic provenance</b>: DataLad can capture arbitrary dataset
|
|
transformations (e.g., from computing analysis results) and record
|
|
the cause of such a change
|
|
</li>
|
|
<pre><code class="bash" style="max-height:none">$ datalad run -m "Perform eye movement event detection"\
|
|
--input 'raw_data/*.tsv.gz' --output 'sub-*' \
|
|
bash code/compute_all.sh
|
|
|
|
-- Git commit -- Michael Hanke < ... @gmail.com>; Fri Sep 21 22:00:47 2019
|
|
[DATALAD RUNCMD] Perform eye movement event detection
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "bash code/compute_all.sh",
|
|
"dsid": "d2b4b72a-7c13-11e7-9f1f-a0369f7c647e",
|
|
"exit": 0,
|
|
"inputs": ["raw_data/*.tsv.gz"],
|
|
"outputs": ["sub-*"],
|
|
"pwd": "."
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/sub-01_task-movie_run-1_events.png | 2 +-
|
|
sub-01/sub-01_task-movie_run-1_events.tsv | 2 +-
|
|
...</code></pre>
|
|
</ul>
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Provenance capture</h2>
|
|
<ul>
|
|
<li><b>Computational provenance</b>: Datasets can track <b>software containers</b>,
|
|
and perform and record computations inside it:
|
|
</li>
|
|
<pre><code class="bash" style="max-height:none">$ datalad containers-run -n neuroimaging-container \
|
|
--input 'mri/*_bold.nii' --output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"bash -c 'for sub in sub-*; do for run in run-1 ... run-8;
|
|
do python3 code/extract_lc_timeseries.py \$sub \$run; done; done'"
|
|
|
|
-- Git commit -- Michael Hanke < ... @gmail.com>; Fri Jul 6 11:02:28 2019
|
|
[DATALAD RUNCMD] singularity exec --bind {pwd} .datalad/e...
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...</code></pre>
|
|
</ul>
|
|
</section>
|
|
|
|
<section data-transition="none">
|
|
<h2>Provenance capture</h2>
|
|
<ul>
|
|
<li>All recorded transformations can be re-computed automatically</li>
|
|
<pre><code class="bash" style="max-height:none">$ datalad rerun eee1356bb7e8f921174e404c6df6aadcc1f158f0
|
|
[INFO] == Command start (output follows) =====
|
|
[INFO] == Command exit (modification check follows) =====
|
|
add(ok): sub-01/LC_timeseries_run-1.csv (file)
|
|
...
|
|
save(ok): . (dataset)
|
|
action summary:
|
|
add (ok: 45)
|
|
save (notneeded: 45, ok: 1)
|
|
unlock (notneeded: 45)
|
|
...</code></pre>
|
|
|
|
<ul>
|
|
<li>Aid with the reproducibility of a result and verify it (via content hash)</li>
|
|
<li>Use complete capture and automatic re-computation as an alternative to storage and transport</li>
|
|
|
|
</ul>
|
|
|
|
</ul>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<!-- Sneak peek into additional functionality -->
|
|
|
|
<section>
|
|
<h2>And there is more...</h2>
|
|
</section>
|
|
|
|
<section data-markdown><script type="text/template">
|
|
## Scalable and Actionable (meta)data representations
|
|
<!-- .element: height="400" style="margin-top:-10px;margin-bottom:-30px" -->
|
|
- meta data logistics to generate, store, share, and search arbitrary meta data
|
|
- Sharing/retrieving only dataset meta data enables data discovery that doesn't require
|
|
retrieving (potentially large) file contents first
|
|
- facilitates building metadata-driven applications
|
|
|
|
|
|
<aside class="notes">
|
|
- be able to: pick something that works now, and be able to transition
|
|
to something better
|
|
- principle: programmatic (re-)extraction of metadata
|
|
- datalad provides metadata transport
|
|
</aside>
|
|
</script>
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Dealing with sensitive data</h2>
|
|
<ul>
|
|
<li>Flexible data access management - file contents can be made available
|
|
to none or selected few</li>
|
|
<li>Share anonymized meta data not subject to privacy concerns
|
|
</li>
|
|
<li>Viable solution to bring the computation to the data</li>
|
|
<img src="../pics/artwork/src/hospital.png">
|
|
</ul>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h2>Still more...</h2>
|
|
<dl>
|
|
<dt>"RIA stores" for central data management, archival, or backup:</dt>
|
|
<dd>
|
|
self-contained, plain file system storage for datasets, tuned for
|
|
maintainability and use on systems with inode limitations (any-sized
|
|
dataset ≈ 25 inodes). More: <a href="https://handbook.datalad.org/en/latest/r.html?RIA" target="_blank">
|
|
handbook.datalad.org/r.html?RIA
|
|
</a>
|
|
</dd>
|
|
<dt>Extensions with additional or domain-specific functionality:</dt>
|
|
<dd>Available as pip-installable Python packages. More: <a href="https://handbook.datalad.org/en/latest/r.html?extensions" target="_blank">
|
|
handbook.datalad.org/r.html?extensions
|
|
</a> </dd>
|
|
<dt>Open Data collection:</dt>
|
|
<dd>As of August 2020, about 250TB of open data: <a href="http://datasets.datalad.org/" target="_blank">
|
|
datasets.datalad.org/
|
|
</a> </dd>
|
|
</dl>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
<section>
|
|
<h2>{Open,Transparent,Reproducible} science</h2>
|
|
|
|
<ul>
|
|
<li class="fragment fade-in-then-semi-out"> Treat data like software: obtain, version, share, and update data</li>
|
|
<li class="fragment fade-in-then-semi-out"> Simplified data management, disk-space aware storage & computing</li>
|
|
<li class="fragment fade-in-then-semi-out"> Transparent and reproducible science: link code, data, software, and execution
|
|
in a human- and machine-readable way</li>
|
|
<li class="fragment fade-in-then-semi-out"> Collaborate: Generic workflows, interoperability with established tools & services</li>
|
|
<!-- <li class="fragment fade-in-then-semi-out"> Publish and share data, but retain complete control over storage and access</li> -->
|
|
|
|
</ul>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
<section>
|
|
<h2>Find out more</h2>
|
|
<table>
|
|
<tr>
|
|
<td>
|
|
Comprehensive user documentation in the<br>
|
|
DataLad Handbook
|
|
<a href="https://handbook.datalad.org">(handbook.datalad.org)</a>
|
|
</td>
|
|
<td>
|
|
<img src="../pics/logo.svg" height="150">
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<table>
|
|
<tr>
|
|
<td><img src="../pics/artwork/src/enter.svg" height="100"></td>
|
|
<td>
|
|
<ul>
|
|
<li>High-level function/command overviews, <br>
|
|
Installation, Configuration, Cheatsheet</li>
|
|
</ul>
|
|
</td>
|
|
</tr>
|
|
<tr>
|
|
<td><img src="../pics/artwork/src/basics.svg" height="100"></td>
|
|
<td>
|
|
<ul>
|
|
<li>Narrative-based code-along course</li>
|
|
<li>Independent of background/skill level, <br>
|
|
suitable for data management novices</li>
|
|
</ul>
|
|
</td>
|
|
</tr>
|
|
<tr>
|
|
<td><img src="../pics/artwork/src/usecases.svg" height="100"></td>
|
|
<td>
|
|
<ul>
|
|
<li>Step-by-step solutions to common <br>
|
|
data management problems, like<br />how to
|
|
make a reproducible paper</li>
|
|
</ul>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<aside class="notes">
|
|
- what is in it?
|
|
- how is it structured?
|
|
- who and what is it aiming for?
|
|
- show "big picture" figure
|
|
- claim data management demands of science map well onto datalad functionality
|
|
- summarize remaining principles (obsoletion insurance, etc.)
|
|
</aside>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Further Information</h2>
|
|
<ul>
|
|
<li>Source code: <a href="https://github.com/datalad/datalad">github.com/datalad/datalad</a> </li>
|
|
<li>Technical docs: <a href="https://docs.datalad.org">docs.datalad.org</a></li>
|
|
<li>Video tutorials: <a href="https://www.youtube.com/channel/UCB8-Zf7D0DSzAsREoIt0Bvw">Youtube channel "DataLad"</a> </li>
|
|
<li>Matrix channel: <a href="https://matrix.to/#/!SaWRuXhTcCDulfttET:matrix.org?via=matrix.org&via=inm7.de" target="_blank">DataLad</a> </li>
|
|
</ul>
|
|
<br>
|
|
<br>
|
|
Slides: <a href="https://github.com/datalad-handbook/course/blob/master/talks/PDFs/DebConf.pdf" target="_blank">
|
|
https://github.com/datalad-handbook/course/</a>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<h1>Thanks!</h1>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
</div>
|
|
</div>
|
|
|
|
<script src="../reveal.js/dist/reveal.js"></script>
|
|
<script src="../reveal.js/plugin/notes/notes.js"></script>
|
|
<script src="../reveal.js/plugin/markdown/markdown.js"></script>
|
|
<script src="../reveal.js/plugin/highlight/highlight.js"></script>
|
|
<script>
|
|
// Boot the slide deck: configure reveal.js and register the Markdown,
// Highlight and Notes plugins whose scripts are loaded just above.
// More info about initialization & config:
|
|
// - https://revealjs.com/initialization/
|
|
// - https://revealjs.com/config/
|
|
Reveal.initialize({
|
|
// NOTE(review): `hash` presumably keeps the URL fragment in sync with the
// current slide, and `history` (further down) presumably also records each
// slide change in the browser history — confirm against the config docs above.
hash: true,
|
|
// The "normal" size of the presentation, aspect ratio will be preserved
|
|
// when the presentation is scaled to fit different resolutions. Can be
|
|
// specified using percentage units.
|
|
width: 1280,
|
|
height: 960,
|
|
// Factor of the display size that should remain empty around the content
|
|
margin: 0.3,
|
|
// Bounds for smallest/largest possible scale to apply to content
|
|
minScale: 0.2,
|
|
maxScale: 1.0,
|
|
|
|
// Navigation UI and layout switches; their exact semantics are defined by
// reveal.js (see the config docs linked above).
controls: true,
|
|
progress: true,
|
|
history: true,
|
|
center: true,
|
|
// NOTE(review): 'c' is presumably the "flattened current slide number"
// format — confirm for the bundled reveal.js version.
slideNumber: 'c',
|
|
// PDF-export settings (pdf* keys).
// NOTE(review): verify that the -1 page-height offset is still needed for
// the bundled reveal.js version.
pdfSeparateFragments: false,
|
|
pdfMaxPagesPerSlide: 1,
|
|
pdfPageHeightOffset: -1,
|
|
// Deck-wide default transition; individual <section> elements in this file
// set their own via the data-transition attribute.
transition: 'slide', // none/fade/slide/convex/concave/zoom
|
|
// Learn about plugins: https://revealjs.com/plugins/
|
|
plugins: [ RevealMarkdown, RevealHighlight, RevealNotes ]
|
|
});
|
|
</script>
|
|
</body>
|
|
</html>
|