749 lines
33 KiB
HTML
749 lines
33 KiB
HTML
<!doctype html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<!-- Zoom is left enabled so readers can enlarge slide content. -->
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
|
|
<!-- Edit me start! -->
|
|
<title>DataLad</title>
|
|
<meta name="description" content=" DataLad ">
|
|
<meta name="author" content=" Adina Wagner ">
|
|
<!-- Edit me end! -->
|
|
|
|
<link rel="stylesheet" href="../reveal.js/dist/reset.css">
|
|
<link rel="stylesheet" href="../reveal.js/dist/reveal.css">
|
|
<link rel="stylesheet" href="../reveal.js/dist/theme/beige.css">
|
|
<link rel="stylesheet" href="../css/main.css">
|
|
<!-- Theme used for syntax highlighted code -->
|
|
<link rel="stylesheet" href="../reveal.js/plugin/highlight/monokai.css">
|
|
</head>
|
|
<body>
|
|
<div class="reveal">
|
|
<div class="slides">
|
|
|
|
|
|
<section>
|
|
<section>
|
|
<br>
|
|
<br>
|
|
<table style="border:none">
|
|
<tr>
|
|
<td style="vertical-align:top;"><img style="height:150px;margin-bottom:30px" data-src="../pics/datalad_logo_wide.svg" alt="DataLad logo">
|
|
</td>
|
|
<td>
|
|
<h2>
|
|
Data Management <br> for Open Science</h2>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<br><br>
|
|
<div style="margin-top:1em;text-align:center">
|
|
<table style="border: none;">
|
|
<tr style="border:none;">
|
|
<td style="vertical-align:top; border:none;">Dr. Adina Wagner
|
|
<br><small>
|
|
<a href="https://mas.to/@adswa" target="_blank">
|
|
<img data-src="../pics/mastodon.svg" style="height:30px;margin:0px" />
|
|
mas.to/@adswa</a></small></td>
|
|
<td style="vertical-align:top; border:none"><img style="height:100px;margin-right:10px" data-src="../pics/fzj_logo.svg" />
|
|
<br></td>
|
|
</tr>
|
|
<tr style="border:none;vertical-align:top;">
|
|
<td style="vertical-align:top;border:none;">
|
|
<small>Institute of Neuroscience and
|
|
Medicine, Brain & Behavior (INM-7)<br>
|
|
Research Center Jülich<br></small>
|
|
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</div>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:0px;margin-bottom:100px;margin-left:1000px">
|
|
<img src="../pics/neuraltraces.png" height="200" alt="QR code linking to the slides">
|
|
</p>
|
|
<br><small>
|
|
DOI: <a href="https://doi.org/10.5281/zenodo.10869053" target="_blank">doi.org/10.5281/zenodo.10869053</a> <br>
|
|
Slides: <a href="https://files.inm7.de/adina/talks/html/neuraltraces.html#/" target="_blank">files.inm7.de/adina/talks/html/neuraltraces.html</a>
|
|
</small>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
<section>
|
|
<h2>Research data management?</h2>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="0" src="../pics/turingway_rdm.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="1" width="1000" src="../pics/RDM.gif">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="2" height="700" src="../pics/stolenlaptop.jpg">
|
|
<p> <img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/theverge_excel_genetics_screen1.png" style="box-shadow: 10px 10px 8px #888888;height:200px" height="200"><br>
|
|
<img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/theverge_excel_genetics_screen2.gif" style="box-shadow: 10px 10px 8px #888888;height:300px" height="300"><br>
|
|
<img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/theverge_excel_genetics_screen3.png" style="box-shadow: 10px 10px 8px #888888;height:200px" height="200"><br>
|
|
</p>
|
|
<p> <img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/guardian_excel_corona_screen1.png" style="box-shadow: 10px 10px 8px #888888;height:200px" height="200"><br>
|
|
<img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/guardian_excel_corona_screen2.png" style="box-shadow: 10px 10px 8px #888888;height:300px" height="300"><br>
|
|
</p>
|
|
<img class="fragment fade-in" data-fragment-index="5" src="../pics/frontend_vs_backend_paper.png" style="box-shadow: 10px 10px 8px #888888;height:600px">
|
|
</div>
|
|
<!-- Credit for frontend_vs_backend_paper.png: shown together with that image (fragment 5). -->
<p class="fragment fade-in" data-fragment-index="5">
|
|
<imgcredit> adapted from https://dribbble.com/shots/3090048-Front-end-vs-Back-end</imgcredit>
|
|
</p>
|
|
<small class="fragment fade-in-then-out" data-fragment-index="3"><a href="https://www.theverge.com/2020/8/6/21355674/human-genes-rename-microsoft-excel-misreading-dates" target="_blank">
|
|
www.theverge.com/2020/8/6/21355674/human-genes-rename-microsoft-excel-misreading-dates</a></small>
|
|
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<img style="height:150px;margin-bottom:30px" data-src="../pics/datalad_logo_wide.svg" alt="DataLad logo"><br>
|
|
<ul style="font-size:37px">
|
|
<li>Domain-agnostic <strong>command-line tool</strong> (+ <strong>graphical user interface</strong>),
|
|
built on top of <a href="https://git-scm.com/" target="_blank">Git</a> <img style="vertical-align:middle" src="../pics/git.png" height="50px">
|
|
& <a href="https://git-annex.branchable.com/" target="_blank">Git-annex</a><img style="vertical-align:middle" src="../pics/gitannex.png" height="70px"></li>
|
|
<li>10+ year open source project (100+ contributors), available for all major OS</li>
|
|
<li>Major features:</li>
|
|
<dt>Version-controlling arbitrarily large content </dt>
|
|
<dd>Version control data & software alongside to code!</dd>
|
|
<dt>Transport mechanisms for sharing, updating & obtaining data </dt>
|
|
<dd>Consume & collaborate on data (analyses) like software</dd>
|
|
<dt>(Computationally) reproducible data analysis</dt>
|
|
<dd>Track and share provenance of all digital objects</dd>
|
|
<dt>(... and <i>much</i> more) </dt>
|
|
<br>
|
|
</ul>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<section data-transition="None">
|
|
<dl style="font-size:40px">
|
|
<dt>The building blocks of a scientific result are rarely static</dt>
|
|
<table>
|
|
<tr>
|
|
<td style="vertical-align:middle">Analysis code, manuscripts, ... evolve<br>
|
|
<small>(Rewrite, fix bugs, add functions,
|
|
refactor, extend, ...)</small></td>
|
|
<td>
|
|
<img src="../pics/final.png" height="500">
|
|
<imgcredit>Based on Piled Higher and Deeper
|
|
<a href="https://phdcomics.com/comics/archive_print.php?comicid=1531" target="_blank">
|
|
1531
|
|
</a> </imgcredit></td>
|
|
</tr>
|
|
</table>
|
|
</dl>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<dl style="font-size:40px">
|
|
<dt>The building blocks of a scientific result are rarely static</dt>
|
|
<table>
|
|
<tr>
|
|
<td style="vertical-align:middle"><strong>Data</strong> changes, too <br>
|
|
<small>(errors are fixed, data is extended,<br>
|
|
naming standards change, an analysis <br>
|
|
requires only a subset of your data...)</small></td>
|
|
<td>
|
|
<div class="r-stack">
|
|
<img data-fragment-index="1" src="../pics/phd052810s.png" height="500">
|
|
</div>
|
|
<imgcredit>Piled Higher and Deeper
|
|
<a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
|
|
1323
|
|
</a> </imgcredit></td>
|
|
</tr>
|
|
</table>
|
|
</dl>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Version control - beyond text files</h2>
|
|
<table>
|
|
<tr>
|
|
<td style="vertical-align:top">
|
|
<img src="../pics/turingway/ProjectHistory.png" width="600">
|
|
<imgcredit><a href="https://the-turing-way.netlify.app/reproducible-research/vcs/vcs-data.html" target="_blank">
|
|
CC-BY Scriberia & The Turing Way</a>
|
|
</imgcredit>
|
|
</td>
|
|
<td style="vertical-align:middle">
|
|
<ul style="font-size:35px">
|
|
<li>keep things organized</li>
|
|
<li>keep track of changes</li>
|
|
<li>revert changes or go <br>
|
|
back to previous states</li>
|
|
<li>collect and share digital provenance</li>
|
|
<li>industry standard: Git</li>
|
|
</ul>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
<p class="fragment fade-in" data-fragment-index="2">
|
|
Building up on Git and git-annex, DataLad version controls <strong>any</strong> data
|
|
</p>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-in" height="500" data-fragment-index="3" src="../pics/tigdata.png">
|
|
<img class="fragment fade-in" height="500" data-fragment-index="4" src="../pics/tigdata3.png">
|
|
<img class="fragment fade-in" height="500" data-fragment-index="5" src="../pics/tigdata2.png">
|
|
</div>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Version Control</h2>
|
|
|
|
<ul>
|
|
<li>DataLad knows two things: Datasets and files</li>
|
|
<li class="fragment fade-in" data-fragment-index="3">A DataLad dataset is a Git repository:</li>
|
|
<ul class="fragment fade-in" data-fragment-index="3">
|
|
<li>Content and domain agnostic</li>
|
|
<li>Minimization of custom procedures or data structures (<b>user must not lose data or data access if DataLad vanishes</b>)</li>
|
|
<li><b>Uncompromised decentralization</b></li>
|
|
</ul></ul>
|
|
<img src="../pics/artwork/src/dataset.svg" height="330">
|
|
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Version Control: Data</h2>
|
|
|
|
<ul>
|
|
<li class="fragment fade-in-then-semi-out" data-fragment-index="1">Datasets have an optional annex for (large or sensitive) data (or text/code). </li>
|
|
<li class="fragment fade-in-then-semi-out" data-fragment-index="1">Identity (hash) and location information is put
|
|
into Git, rather than file content. The annex, and transport to and from
|
|
it is managed with git-annex <b>→ decentralized version control for files of any size.</b></li>
|
|
<li class="fragment fade-in-then-semi-out" data-fragment-index="2">DataLad works towards wrapping Git and git-annex into a non-complex core-API
|
|
(helpful for data management novices).</li>
|
|
</ul>
|
|
<img height="330" data-fragment-index="2" class="fragment fade-in" src="../pics/artwork/src/local_wf.svg">
|
|
<ul>
|
|
<li class="fragment fade-in" data-fragment-index="2">Flexibility and commands of Git and git-annex are preserved (useful for experienced Git/git-annex users).</li>
|
|
</ul>
|
|
<small><p class="fragment" data-fragment-index="1">Delineation and advantages of decentral versus central RDM:<a href="https://doi.org/10.1515/nf-2020-0037" target="_blank">
|
|
Hanke et al., (2021). In defense of decentralized research data management</a></p></small>
|
|
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Version Control: Nesting</h2>
|
|
|
|
<ul>
|
|
<li>Seamless nesting mechanisms:
|
|
<img height="330" src="../pics/artwork/src/linkage_subds.svg">
|
|
<ul>
|
|
<li>hierarchies of datasets in super-/sub-dataset relationships</li>
|
|
<li>based on Git submodules, but more seamless</li>
|
|
</ul>
|
|
<li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
|
|
<pre class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
|
|
15530572 annex'd files (77.9 TB recorded total size)
|
|
nothing to save, working tree clean</code></pre>
|
|
<small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
|
|
<li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
|
|
</ul>
|
|
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Use DataLad for ...</h2>
|
|
<ul>
|
|
<li>... self-descriptive, reusable projects</li>
|
|
</ul>
|
|
<div class="r-stack">
|
|
<img height="800" class="fragment fade-in-then-out" data-fragment-index="0" src="../pics/remodnavpaper_2.gif" alt="a screenrecording of cloning a reproducible paper linking code and data from github">
|
|
</div>
|
|
<small><a href="https://github.com/psychoinformatics-de/paper-remodnav" target="_blank">github.com/psychoinformatics-de/paper-remodnav</a> </small>
|
|
</section>
|
|
|
|
</section>
|
|
|
|
|
|
|
|
<section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Use DataLad to ...</h2>
|
|
<ul>
|
|
<li>... share and consume data like source code</li>
|
|
</ul>
|
|
<div class="r-stack">
|
|
<img height="700" class="fragment fade-in-then-out" data-fragment-index="0" src="../pics/getdata_studyforrest.gif" alt="a screenrecording of cloning studyforrest data from github">
|
|
</div>
|
|
<aside class="notes">
|
|
Idea behind datalad: Enable a similar level of tooling and culture for the distribution and version control of data as it is present for open source software development
|
|
</aside>
|
|
</section>
|
|
|
|
|
|
<section data-transition="None">
|
|
<h2>Publishing datasets</h2>
|
|
<ul>
|
|
<li>The best of both worlds: Publish to Git hosting services, storage providers, or both</li>
|
|
</ul>
|
|
<div class="r-stack">
|
|
<img height="800" src="../pics/artwork/src/publishing/publishing_network_gitvsannex.svg">
|
|
</div>
|
|
</section>
|
|
|
|
|
|
<section data-transition="None">
|
|
<h2>Transport logistics</h2>
|
|
<ul>
|
|
<li>Scientific workflows can be idiosyncratic across institutions / departments / labs / any two scientists</li>
|
|
<li>DataLad is built to maximize interoperability and streamline routines across hosting and
|
|
storage technology</li>
|
|
</ul>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-out" data-fragment-index="1" src="../pics/services_only.png" height="650">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="1" src="../pics/services_connected.png" height="650">
|
|
<img class="fragment fade-in" data-fragment-index="2" src="../pics/artwork/src/collaboration.svg" height="650">
|
|
</div>
|
|
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h2>Lots of data, little disk-usage</h2>
|
|
<ul>
|
|
<li class="fragment fade-in" data-fragment-index="1">
|
|
Cloned datasets are lean.
|
|
"Meta data" (file names, availability) are present, but <b>no file content</b>:</li>
|
|
<pre class="fragment fade-in" data-fragment-index="1"><code data-trim class="language-bash">$ datalad clone git@github.com:psychoinformatics-de/studyforrest-data-phase2.git
|
|
install(ok): /tmp/studyforrest-data-phase2 (dataset)
|
|
$ cd studyforrest-data-phase2 && du -sh
|
|
18M .</code></pre>
|
|
|
|
<li class="fragment fade-in" data-fragment-index="2">
|
|
files' contents can be retrieved on demand - and also dropped:
|
|
</li>
|
|
</ul>
|
|
<pre class="fragment fade-in" data-fragment-index="2"><code data-trim class="language-bash">$ datalad get sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
|
|
get(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [from mddatasrc...]</code></pre>
|
|
|
|
<pre class="fragment fade-in" data-fragment-index="2"><code data-trim class="language-bash">$ datalad drop sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
|
|
drop(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [checking https://arxiv.org/pdf/0904.3664v1.pdf...]</code></pre>
|
|
<li class="fragment fade-in" data-fragment-index="3">Have access to more data on your computer than you have disk-space:</li>
|
|
<pre class="fragment fade-in" data-fragment-index="3"><code># eNKI dataset (1.5TB, 34k files):
|
|
$ du -sh
|
|
1.5G .
|
|
# HCP dataset (~200TB, >15 million files)
|
|
$ du -sh
|
|
48G . </code></pre>
|
|
<pre class="fragment fade-in" data-fragment-index="3"><code class="python">dl.get('input/sub-01')
|
|
[really complex analysis]
|
|
dl.drop('input/sub-01')
|
|
</code></pre>
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Have yourself some data</h2>
|
|
<img src="../pics/openneuro_new_2.gif">
|
|
> 500TB of open data available at <a href="http://datasets.datalad.org/" target="_blank">
|
|
datasets.datalad.org
|
|
</a>
|
|
</section>
|
|
|
|
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<section data-transition="fade">
|
|
<h2>Reusing past work</h2>
|
|
... isn't necessarily simple
|
|
<p class="fragment fade-in" data-fragment-index="1">Your past self is the worst collaborator:
|
|
<div class="r-stack">
|
|
<img src="../pics/legacycode_phd.png" height="500">
|
|
<img class="fragment fade-in" data-fragment-index="1" src="../pics/ownlegacycode_phd.png" height="500">
|
|
</div>
|
|
|
|
<imgcredit>Full comic at <a href="http://phdcomics.com/comics.php?f=1689">http://phdcomics.com/comics.php?f=1979</a></imgcredit>
|
|
</p>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h2>Lack of provenance can be devastating</h2>
|
|
|
|
<ul>
|
|
<li>Data analyses or data wrangling is complex</li>
|
|
<ul>
|
|
<li>Move/Copy/Rename/Reorganize/Transform/Compute/... data</li>
|
|
</ul>
|
|
<li>Mistakes propagate through the complete analysis pipeline -
|
|
especially those early ones are hard to find!</li>
|
|
</ul>
|
|
<img src="../pics/Provenance_alpha.png" height="600">
|
|
<imgcredit>CC-BY Scriberia and The Turing Way</imgcredit>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Leaving a trace </h2>
|
|
<p>"Shit, which version of which script produced these outputs from which version
|
|
of what data?"</p>
|
|
<p>
|
|
"Shit, which buttons did I click and in which order did I use all those tools?"</p>
|
|
<br>
|
|
<div class="r-stack">
|
|
<p>
|
|
<img src="../pics/manuallabor.png">
|
|
<img src="../pics/findfiles.png" height="400">
|
|
<imgcredit>CC-BY Scriberia and <a href="https://the-turing-way.netlify.app/reproducible-research/rdm.html" target="_blank">
|
|
The Turing Way</a>
|
|
</imgcredit>
|
|
</p>
|
|
</div>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Leaving a trace</h2>
|
|
<p class="fragment" data-fragment-index="1"> <strong>datalad run</strong> wraps around anything expressed in a command
|
|
line call and saves the dataset modifications resulting from the execution.</p>
|
|
<p class="fragment" data-fragment-index="2"> <strong>datalad rerun</strong> repeats captured executions.
|
|
If the outcomes
|
|
differ, it saves a new state of them.</p>
|
|
<p class="fragment" data-fragment-index="3"> <strong>datalad containers-run</strong> executes command
|
|
line calls inside a tracked software container and saves the dataset modifications resulting from the execution.</p>
|
|
|
|
<div class="r-stack">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="1" src="../pics/run_basic.svg" height="350">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/rerun.svg" height="350">
|
|
<img class="fragment fade-in" data-fragment-index="3" src="../pics/containers-run_basic.svg" height="350">
|
|
</div>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px">
|
|
Enshrine the analysis in a script
|
|
</p>
|
|
<p class="fragment fade-in" style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:180px;margin-bottom:-60px">
|
|
Here: extract_lc_timeseries.py
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="6">$ datalad containers-run \
|
|
--message "Time series extraction from Locus Coeruleus" \
|
|
--container-name nilearn \
|
|
--input 'mri/*_bold.nii' \
|
|
--output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"python3 code/extract_lc_timeseries.py"
|
|
|
|
-- Git commit --
|
|
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
|
|
Author: Adina Wagner adina.wagner@t-online.de
|
|
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
|
|
Commit: Adina Wagner adina.wagner@t-online.de
|
|
CommitDate: Mon Nov 11 16:15:08 2019 +0100
|
|
|
|
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...
|
|
</code></pre>
|
|
</p>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:130px;margin-bottom:-60px;margin-left:750px">
|
|
Record code execution together <br> with
|
|
input-data, output files and software
|
|
environment in the
|
|
execution-command
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="1-6">$ datalad containers-run \
|
|
--message "Time series extraction from Locus Coeruleus" \
|
|
--container-name nilearn \
|
|
--input 'mri/*_bold.nii' \
|
|
--output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"python3 code/extract_lc_timeseries.py"
|
|
|
|
-- Git commit --
|
|
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
|
|
Author: Adina Wagner adina.wagner@t-online.de
|
|
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
|
|
Commit: Adina Wagner adina.wagner@t-online.de
|
|
CommitDate: Mon Nov 11 16:15:08 2019 +0100
|
|
|
|
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...
|
|
</code></pre>
|
|
</p>
|
|
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:200px">
|
|
Result: machine readable record about which data, code, and <br>
|
|
software produced a result how, when, and why.
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="8-30">$ datalad containers-run \
|
|
--message "Time series extraction from Locus Coeruleus" \
|
|
--container-name nilearn \
|
|
--input 'mri/*_bold.nii' \
|
|
--output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"python3 code/extract_lc_timeseries.py"
|
|
|
|
-- Git commit --
|
|
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
|
|
Author: Adina Wagner adina.wagner@t-online.de
|
|
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
|
|
Commit: Adina Wagner adina.wagner@t-online.de
|
|
CommitDate: Mon Nov 11 16:15:08 2019 +0100
|
|
|
|
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...
|
|
</code></pre>
|
|
</p>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:350px">
|
|
Use the unique identifier of the execution record
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="1">$ datalad rerun 5a7565a640ff6de67
|
|
[INFO ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
|
|
[INFO ] Making sure inputs are available (this may take some time)
|
|
get(ok): mri/sub-01_bold.nii (file)
|
|
get(ok): mri/sub-02_bold.nii (file)
|
|
[...]
|
|
[INFO ] == Command start (output follows) =====
|
|
[INFO ] == Command exit (modification check follows) =====
|
|
add(ok): sub-01/LC_timeseries_run-*.csv (file)
|
|
add(ok): sub-02/LC_timeseries_run-*.csv (file)
|
|
[...]
|
|
action summary:
|
|
add (ok: 30)
|
|
get (ok: 30)
|
|
save (ok: 2)
|
|
unlock (ok: 30)
|
|
</code></pre>
|
|
</p>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:400px;margin-bottom:-60px;margin-left:350px">
|
|
... to have a machine recompute and verify past work
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="2-16">$ datalad rerun 5a7565a640ff6de67
|
|
[INFO ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
|
|
[INFO ] Making sure inputs are available (this may take some time)
|
|
get(ok): mri/sub-01_bold.nii (file)
|
|
get(ok): mri/sub-02_bold.nii (file)
|
|
[...]
|
|
[INFO ] == Command start (output follows) =====
|
|
[INFO ] == Command exit (modification check follows) =====
|
|
add(ok): sub-01/LC_timeseries_run-*.csv (file)
|
|
add(ok): sub-02/LC_timeseries_run-*.csv (file)
|
|
[...]
|
|
action summary:
|
|
add (ok: 30)
|
|
get (ok: 30)
|
|
save (ok: 2)
|
|
unlock (ok: 30)
|
|
</code></pre>
|
|
</section>
|
|
|
|
|
|
<section data-transition="None">
|
|
<h3>Provenance at the largest scale:</h3>
|
|
<ul>
|
|
<img height="350" src="../pics/fairly-big.png">
|
|
<img height="500" src="../pics/ukb_datasets.svg">
|
|
</ul>
|
|
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">www.nature.com/articles/s41597-022-01163-2</a> </small>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Research data management is tied to reproducibility</h2>
|
|
<img src="../pics/fragile.png" height="800">
|
|
<imgcredit>Based on <a href="https://xkcd.com/2347/" target="_blank">
|
|
xkcd.com/2347/</a> (CC-BY)</imgcredit>
|
|
<small><a href="https://www.youtube.com/watch?v=nTVcMDVlyOI" target="_blank">
|
|
Reproducibility Management in Neuroscience -
|
|
Specific Issues and Solutions</a>
|
|
(<a href="https://doi.org/10.5281/zenodo.4285927" target="_blank">DOI 10.5281/zenodo.4285927</a>) </small>
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Further Information</h2>
|
|
<ul>
|
|
<br>
|
|
<ul style="font-size:30px">
|
|
<dt>Reach out to the <b>DataLad</b> team via</dt>
|
|
<li>
|
|
<a href="https://matrix.to/#/!NaMjKIhMXhSicFdxAj:matrix.org?via=matrix.waite.eu&via=matrix.org&via=inm7.de" target="_blank">
|
|
Matrix</a> (free, decentralized communication app, no app needed).
|
|
We run a weekly Zoom office hour (Tuesday, 4pm Berlin time) from this room as well.
|
|
</li>
|
|
<li>the development repository on GitHub
|
|
<a href="https://github.com/datalad/datalad" target="_blank">
|
|
(github.com/datalad/datalad)</a>
|
|
</li>
|
|
<br>
|
|
<dt>Reach out to the user community with</dt>
|
|
<li>A question on <a href="https://neurostars.org/" target="_blank">neurostars.org</a>
|
|
with a <code>datalad</code> tag</li>
|
|
<br>
|
|
<dt>Find more user tutorials or workshop recordings</dt>
|
|
<li>On DataLad's YouTube channel <a href="https://www.youtube.com/channel/datalad" target="_blank">
|
|
(www.youtube.com/channel/datalad) </a>
|
|
</li>
|
|
<li>
|
|
In the DataLad Handbook<a href="http://handbook.datalad.org/en/latest/" target="_blank">
|
|
(handbook.datalad.org)</a>
|
|
</li>
|
|
<li>In the DataLad RDM course <a href="https://psychoinformatics-de.github.io/rdm-course/" target="_blank">
|
|
(psychoinformatics-de.github.io/rdm-course)</a> </li>
|
|
<li>In the Official API documentation <a href="http://docs.datalad.org" target="_blank">
|
|
(docs.datalad.org)</a> </li>
|
|
<br>
|
|
<li>On the advantages of decentralized research data management:
|
|
<a href="https://www.degruyter.com/document/doi/10.1515/nf-2020-0037/html" target="_blank">
|
|
doi.org/10.1515/nf-2020-0037
|
|
</a></li>
|
|
</ul>
|
|
</ul>
|
|
<br>
|
|
<br>
|
|
Install it on your own hardware: <a href="http://handbook.datalad.org/r.html?install" target="_blank">handbook.datalad.org/r.html?install</a>
|
|
</section>
|
|
<section>
|
|
<h2>Acknowledgements</h2>
|
|
<table>
|
|
<tr style="vertical-align:top">
|
|
<td style="vertical-align:top">
|
|
<dl>
|
|
<dt>Software</dt>
|
|
<dd style="margin-left:5px!important">
|
|
<ul style="margin-left:5px!important">
|
|
<li>Joey Hess (git-annex)</li>
|
|
<li>The DataLad team &
|
|
contributors</li>
|
|
</ul>
|
|
</dd>
|
|
<br> <br>
|
|
<h2>Thanks!</h2>
|
|
<br>
|
|
<img height="300" src="../pics/neuraltraces.png" alt="QR code linking to the slides"><br>
|
|
<small>(scan the QR code for slides)</small>
|
|
</dl>
|
|
</td>
|
|
<td style="vertical-align:top">
|
|
<div style="margin-bottom:-20px;text-align:center"><strong>Funders</strong></div>
|
|
<img style="height:150px;margin-right:50px" data-src="../pics/nsf_2020.png" />
|
|
<img style="height:150px;margin-right:50px;margin-left:50px" data-src="../pics/binc.png" />
|
|
<img style="height:150px;margin-left:50px" data-src="../pics/bmbf_2020.png" />
|
|
<img style="height:80px;margin-top:-40px;margin-left:auto;margin-right:auto;width:100%" data-src="../pics/fzj_logo.svg" />
|
|
<div style="margin-top:-20px">
|
|
<img style="height:60px;margin-right:20px" data-src="../pics/erdf.png" />
|
|
<img style="height:60px;margin-right:20px" data-src="../pics/cbbs_logo.png" />
|
|
<img style="height:60px" data-src="../pics/LSA-Logo.png" />
|
|
</div>
|
|
<div style="margin-top:40px;margin-bottom:20px;text-align:center"><strong>Collaborators</strong></div>
|
|
<div style="margin-top:-20px">
|
|
<img style="height:100px;margin:20px" data-src="../pics/hbp_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/conp_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/vbc_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-40px">
|
|
<img style="height:120px;margin:20px" data-src="../pics/openneuro_logo.png" />
|
|
<img style="height:120px;margin:20px" data-src="../pics/cbrain_logo.png" />
|
|
<img style="height:140px;margin:20px" data-src="../pics/brainlife_logo.png" />
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
</div>
|
|
</div>
|
|
|
|
<script src="../reveal.js/dist/reveal.js"></script>
|
|
<script src="../reveal.js/plugin/notes/notes.js"></script>
|
|
<script src="../reveal.js/plugin/markdown/markdown.js"></script>
|
|
<script src="../reveal.js/plugin/highlight/highlight.js"></script>
|
|
<script>
|
|
// More info about initialization & config:
|
|
// - https://revealjs.com/initialization/
|
|
// - https://revealjs.com/config/
|
|
// Start reveal.js for this deck with the options below.
Reveal.initialize({
|
|
// Reflect the current slide in the URL hash so a reload or a shared link
// returns to the same slide.
hash: true,
|
|
// The "normal" size of the presentation, aspect ratio will be preserved
|
|
// when the presentation is scaled to fit different resolutions. Can be
|
|
// specified using percentage units.
|
|
width: 1280,
|
|
height: 960,
|
|
// Factor of the display size that should remain empty around the content
|
|
margin: 0.1,
|
|
// Bounds for smallest/largest possible scale to apply to content
|
|
minScale: 0.2,
|
|
maxScale: 1.0,
|
|
|
|
// Show the navigation arrows.
controls: true,
|
|
// Show the presentation progress bar.
progress: true,
|
|
// Push each slide change to the browser history.
history: true,
|
|
// Vertically center slide content.
center: true,
|
|
// Show the slide number; 'c' selects the flattened current-slide format.
slideNumber: 'c',
|
|
// PDF export: do not print each fragment on a separate page.
pdfSeparateFragments: false,
|
|
// PDF export: a single slide may expand onto at most one page.
pdfMaxPagesPerSlide: 1,
|
|
// PDF export: height offset applied to page content (-1 is the reveal.js default).
pdfPageHeightOffset: -1,
|
|
// Deck-wide default transition; individual slides override it through
// their data-transition attribute.
transition: 'slide', // none/fade/slide/convex/concave/zoom
|
|
// Learn about plugins: https://revealjs.com/plugins/
|
|
plugins: [ RevealMarkdown, RevealHighlight, RevealNotes ]
|
|
});
|
|
</script>
|
|
</body>
|
|
</html>
|