1133 lines
48 KiB
HTML
1133 lines
48 KiB
HTML
<!doctype html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
|
|
|
|
<!-- Edit me start! -->
|
|
<title>DataLad @ NHR </title>
|
|
<meta name="description" content="Decentralized Management of Digital Objects for Open Science">
|
|
<meta name="author" content="Adina Wagner">
|
|
<!-- Edit me end! -->
|
|
|
|
<link rel="stylesheet" href="../reveal.js/dist/reset.css">
|
|
<link rel="stylesheet" href="../reveal.js/dist/reveal.css">
|
|
<link rel="stylesheet" href="../reveal.js/dist/theme/beige.css">
|
|
<link rel="stylesheet" href="../css/main.css">
|
|
|
|
<!-- Theme used for syntax highlighted code -->
|
|
<link rel="stylesheet" href="../reveal.js/plugin/highlight/monokai.css">
|
|
</head>
|
|
<body>
|
|
<div class="reveal">
|
|
<div class="slides">
|
|
|
|
<section>
|
|
<section>
|
|
<h2>DataLad</h2>
|
|
<h3>Decentralized Management of Digital Objects for Open Science</h3>
|
|
|
|
<div style="margin-top:1em;text-align:center">
|
|
<table style="border: none;">
|
|
<tr>
|
|
<td style="border: none;">Dr. Adina Wagner
|
|
<br><small>
|
|
<a href="https://mas.to/@adswa" target="_blank">
|
|
<img data-src="../pics/mastodon.svg" style="height:30px;margin:0px" />
|
|
mas.to/@adswa</a></small></td>
|
|
<td style="border: none;">
|
|
<br></td>
|
|
</tr>
|
|
<tr>
|
|
<td style="border: none; vertical-align:top">
|
|
<small>
|
|
<br> Institute of Neuroscience and
|
|
Medicine, Brain & Behavior (INM-7)<br>
|
|
Research Center Jülich</small><br>
|
|
</td>
|
|
<td><img style="height:100px;margin-right:10px" data-src="../pics/fzj_logo.png" /></td>
|
|
</tr>
|
|
</table>
|
|
</div>
|
|
<!-- <p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:0px;margin-bottom:100px;margin-left:1000px">
|
|
<img src="../pics/qr_nhr.png" height="200">
|
|
</p>-->
|
|
<br><br><small>
|
|
|
|
Slides: <a href="https://doi.org/10.5281/zenodo.15193934" target="_blank">
|
|
DOI 10.5281/zenodo.15193934</a> (Scan the QR code) <br>
|
|
<a href="https://files.inm7.de/adina/talks/html/nhr_2025_datalad.html" target="_blank">
|
|
files.inm7.de/adina/talks/html/nhr_2025_datalad.html</a></small>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h2>Acknowledgements</h2>
|
|
<table>
|
|
<tr style="vertical-align:middle">
|
|
<td style="vertical-align:middle">
|
|
<dl>
|
|
<dt style="margin-top:20px">DataLad software <br>
|
|
& ecosystem</dt>
|
|
<dd style="margin-left:5px!important">
|
|
<ul style="margin-left:5px!important">
|
|
<li>Psychoinformatics Lab, <br>
|
|
Research Centre Jülich</li>
|
|
<li>Center for Open <br>
|
|
Neuroscience, <br>
|
|
Dartmouth College</li>
|
|
<li>Joey Hess (git-annex)</li>
|
|
<li><em>>100 additional contributors</em></li>
|
|
</ul>
|
|
</dd>
</dl>
|
|
</td>
|
|
<td style="vertical-align:middle">
|
|
<div style="margin-bottom:-20px;text-align:center"><strong>Funders</strong></div>
|
|
<img style="height:150px;margin-right:50px" data-src="../pics/nsf.png" />
|
|
<img style="height:150px;margin-right:50px;margin-left:50px" data-src="../pics/binc.png" />
|
|
<img style="height:150px;margin-left:50px" data-src="../pics/bmbf.png" />
|
|
<div style="margin-top:-20px">
|
|
<img style="height:80px;margin-top:-40px;margin-left:40px" data-src="../pics/fzj_logo.svg" />
|
|
<img style="height:60px;margin-left:50px;margin-bottom:25px" data-src="../pics/dfg_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-20px">
|
|
<img style="height:60px;margin-right:20px" data-src="../pics/erdf.png" />
|
|
<img style="height:60px;margin-right:20px" data-src="../pics/cbbs_logo.png" />
|
|
<img style="height:60px" data-src="../pics/LSA-Logo.png" />
|
|
</div>
|
|
<div style="margin-top:40px;margin-bottom:20px;text-align:center"><strong>Collaborators</strong></div>
|
|
<div style="margin-top:-20px">
|
|
<img style="height:100px;margin:20px" data-src="../pics/hbp_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/conp_logo.png" />
|
|
<img style="height:120px;margin:10px" data-src="../pics/openneuro_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-40px">
|
|
<img style="height:100px;margin:20px" data-src="../pics/ebrains-logo.png"/>
|
|
<img style="height:100px;margin:0px" data-src="../pics/gin-logo.png" />
|
|
<img style="height:120px;margin:10px" data-src="../pics/sfb1451_logo.png" />
|
|
</div>
|
|
<div style="margin-top:-40px;align:middle">
|
|
<img style="height:140px;margin:10px" data-src="../pics/brainlife_logo.png" />
|
|
<img style="height:100px;margin:0px" data-src="../pics/cbrain_logo.png" />
|
|
<img style="height:100px;margin:20px" data-src="../pics/vbc_logo.png" />
|
|
</div>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
</section>
|
|
</section>
|
|
|
|
<!-- Data Management and DataLad -->
|
|
|
|
|
|
<section>
|
|
|
|
<!-- Show of hands who has seen this image.
|
|
What is it that people hint at when they show this image? (Git)
|
|
-->
|
|
<section data-transition="None">
|
|
|
|
<h3 class="fragment fade-in" data-fragment-index="1">The building blocks of a scientific result are rarely static</h3>
|
|
<table>
|
|
<tr>
|
|
<div class="r-stack">
|
|
<p class="fragment fade-in-then-out" data-fragment-index="2">Mar 2019</p>
|
|
<p class="fragment fade-in-then-out" data-fragment-index="3">Spring 2019</p>
|
|
<p class="fragment fade-in-then-out" data-fragment-index="4">July 2019</p>
|
|
<p class="fragment fade-in-then-out" data-fragment-index="5">Dec 2019</p>
|
|
<!-- <p class="fragment fade-in-then-out" data-fragment-index="6">Mar 2025 <br>
|
|
<small> <a href="https://www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification/" target="_blank">
|
|
www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification</a> </small></p>-->
|
|
</div>
|
|
</tr>
|
|
<tr>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-out" data-fragment-index="2" src="../pics/phd052810s.png" height="700">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/abcd_data_issues1.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/abcd_data_issues2.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/abcd_data_issues3.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="5" src="../pics/abcd_data_issues4.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/frontend_vs_backend_paper.png">
|
|
<!-- <img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/nda_review.png"> -->
|
|
</div>
|
|
<imgcredit class="fragment fade-out" data-fragment-index="1">Piled Higher and Deeper
|
|
<a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
|
|
1323
|
|
</a> </imgcredit>
|
|
|
|
</tr>
|
|
</table>
|
|
|
|
<div class="r-stack">
|
|
<p style="vertical-align:middle" class="fragment fade-in-then-out" data-fragment-index="1"><u>Data</u> changes <br>
|
|
<small>(errors are fixed, data is extended,<br>
|
|
naming standards change, an analysis <br>
|
|
requires only a subset of your data...)</small></p>
|
|
<p style="vertical-align:middle" class="fragment fade-in" data-fragment-index="2">
|
|
<small>source: <a href="https://abcdstudy.org/scientists/data-sharing-archive" target="_blank">abcdstudy.org/scientists/data-sharing-archive</a> </small></p>
|
|
</div>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
<section>
|
|
<img style="height:300px; margin-top: 0; margin-right:1px;vertical-align:middle;" src="../pics/datalad_logo_wide.svg" alt="">
|
|
<br>
|
|
<ul style="font-size:37px">
|
|
<li>Domain-agnostic <strong>command-line tool</strong>
|
|
(+ <strong>graphical user interface</strong>),
|
|
built on top of <a href="https://git-scm.com/" target="_blank">Git</a>
|
|
& <a href="https://git-annex.branchable.com/" target="_blank">Git-annex</a></li>
|
|
<li>Open source (MIT) research software developed since 2013</li>
|
|
<li>Available for all major operating systems</li>
|
|
<li>Major features:</li>
|
|
<dt>Version-controlling arbitrarily large content </dt>
|
|
<dd>Version control data & software alongside code!</dd>
|
|
<dt>Transport mechanisms for sharing & obtaining data </dt>
|
|
<dd>Consume & collaborate on data (analyses) like software</dd>
|
|
<dt>(Computationally) reproducible data analysis</dt>
|
|
<dd>Track and share provenance of all digital objects</dd>
|
|
<dt>(... and <i>much</i> more) </dt>
|
|
<br>
|
|
</ul>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<img src="../pics/vamp_0_start.png"><br><br>
|
|
A DataLad dataset is a joint Git/git-annex repository that can version control any file
|
|
<br><br>
|
|
<table width=100% style="padding:0px">
|
|
<tr><td style="padding:0px">
|
|
<code><pre>
|
|
# turn any directory into a dataset
|
|
# with version control
|
|
|
|
% datalad create &lt;directory&gt;
|
|
</pre></code>
|
|
</td><td style="padding:0px">
|
|
<code><pre>
|
|
# save a new state of a dataset with
|
|
# file content of any size
|
|
|
|
% datalad save
|
|
</pre></code>
|
|
</td></tr></table>
|
|
</section>
|
|
<section data-transition="None">
|
|
<img src="../pics/vamp_1_provcapture.png">
|
|
<br><br>
|
|
Which data (at which version), with which code, running with what parameterization in which
|
|
computational environment, to generate what?<br><br>
|
|
|
|
<table width=100% style="padding:0px">
|
|
<tr><td style="padding:0px">
|
|
<code><pre>
|
|
# execute any command and capture its output
|
|
# while recording all input versions too
|
|
|
|
% datalad run --input ... --output ... &lt;command&gt;
|
|
</pre></code>
|
|
</td></tr></table>
|
|
</section>
|
|
<section data-transition="None">
|
|
<img src="../pics/vamp_2_pushtocloud.png">
|
|
<br><br>
|
|
Decentral data transport to Git hosting, local or remote infrastructure, or external hosting services
|
|
<br><br>
|
|
|
|
<table width=100% style="padding:0px">
|
|
<tr><td style="padding:0px">
|
|
<code><pre>
|
|
# transfer data and metadata to other sites and services
|
|
# with fine-grained access control for dataset components
|
|
|
|
% datalad push --to &lt;site-or-service&gt;
|
|
</pre></code>
|
|
</td></tr></table>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<img src="../pics/vamp_3_reproduce.png">
|
|
<br><br>
|
|
Outcomes can be validated. This enables audits, promotes accountability, and streamlines automated "upgrades" of outputs
|
|
<br><br>
|
|
<table width=100% style="padding:0px">
|
|
<tr><td style="padding:0px">
|
|
<code><pre>
|
|
# obtain dataset (initially only identity,
|
|
# availability, and provenance metadata)
|
|
|
|
% datalad clone &lt;url&gt;
|
|
</pre></code>
|
|
</td><td style="padding:0px">
|
|
<code><pre>
|
|
# immediately actionable provenance records
|
|
# full abstraction of input data retrieval
|
|
|
|
% datalad rerun &lt;commit|tag|range&gt;
|
|
</pre></code>
|
|
</td></tr></table>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<img src="../pics/vamp_4_reuse.png">
|
|
<br>Datasets can be (re-)used as modular components in larger contexts — propagating
|
|
their traits. They are verifiable, portable, self-contained data structures
|
|
<br><br>
|
|
<table width=100% style="padding:0px">
|
|
<tr><td style="padding:0px">
|
|
<code><pre>
|
|
# declare a dependency on another dataset and
|
|
# re-use it at a particular state in a new context
|
|
|
|
% datalad clone -d &lt;superdataset&gt; &lt;url&gt; &lt;path-in-dataset&gt;
|
|
</pre></code>
|
|
</td></tr></table>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h2>Version control beyond text files</h2>
|
|
<p class="fragment fade-in" data-fragment-index="2">
|
|
<img class="fragment fade-in" data-fragment-index="2" src="../pics/gitannex.png" height="100px">
|
|
Using <a href="https://git-annex.branchable.com" target="_blank">git-annex</a>,
|
|
<a href="https://datalad.org" target="_blank">DataLad</a> version controls large data
|
|
<img class="fragment fade-in" data-fragment-index="2" src="../pics/datalad_logo_wide.svg" height="100px"></p>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-in" height="500" data-fragment-index="3" src="../pics/tigdata.png">
|
|
<img class="fragment fade-in" height="500" data-fragment-index="4" src="../pics/tigdata3.png">
|
|
<img class="fragment fade-in" height="500" data-fragment-index="5" src="../pics/tigdata2.png">
|
|
</div>
|
|
</section>
|
|
|
|
<section data-transition="None" style="font-size:35px">
|
|
<h2>Version control beyond text files</h2>
|
|
<ul>
|
|
<li>Datasets have an <b>annex</b> to track files without
|
|
placing their content into Git</li>
|
|
<li>Rather than content, <strong>identity</strong> (hash) and <strong>location</strong> information is put into Git:</li>
|
|
<ul>
|
|
<li class="fragment fade-in" data-fragment-index="0">Where the filesystem allows it, annexed files are symlinks:</li>
|
|
</ul>
|
|
</ul>
|
|
<pre class="fragment fade-in" data-fragment-index="0"><code class="fragment fade-in language-bash" style="max-width:none" data-fragment-index="0">$ ls -l sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
|
|
lrwxrwxrwx 1 adina adina 142 Jul 22 19:45 sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz ->
|
|
../../.git/annex/objects/kZ/K5/MD5E-s24180157--aeb0e5f2e2d5fe4ade97117a8cc5232f.nii.gz/MD5E-s24180157
|
|
--aeb0e5f2e2d5fe4ade97117a8cc5232f.nii.gz
|
|
</code></pre><small class="fragment fade-in" data-fragment-index="0">(PS: especially useful in datasets with many identical files) </small>
|
|
<ul><ul>
|
|
<li class="fragment fade-in" data-fragment-index="1">The symlink reveals the internal data organization, which is based on the identity hash:</li>
|
|
</ul>
|
|
</ul>
|
|
<pre class="fragment fade-in" data-fragment-index="1"><code class="fragment fade-in language-bash" data-fragment-index="1">$ md5sum sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
|
|
aeb0e5f2e2d5fe4ade97117a8cc5232f sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
|
|
</code></pre>
|
|
<ul><ul>
|
|
<li class="fragment fade-in" data-fragment-index="2">The (tiny) symlink instead of the (potentially large) file content is
|
|
committed - version controlling precise file identity without checking contents into Git
|
|
<img src="../pics/annex-commit.png"></li>
|
|
<li class="fragment fade-in" data-fragment-index="3">File availability information is stored to
|
|
record a decentral network of file content.
|
|
A file can exist in multiple different locations.</li>
|
|
</ul></ul>
|
|
<pre class="fragment fade-in" data-fragment-index="3"><code class="fragment fade-in language-bash" data-fragment-index="3">$ git annex whereis sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz
|
|
whereis sub-02/func/sub-02_task-oneback_run-01_bold.nii.gz (2 copies)
|
|
8c3680dd-6165-4749-adaa-c742232bc317 -- git@8242caf9acd8:/data/repos/adswa/bidsdata.git [gin]
|
|
fff8fdbc-3185-4b78-bd12-718717588442 -- adina@muninn:~/bids-data [here]
|
|
ok
|
|
</code></pre>
|
|
</section>
|
|
|
|
|
|
|
|
<section>
|
|
<h2>Git versus Git-annex</h2>
|
|
<dl>
|
|
<dt>Data in datasets is either stored in Git or git-annex</dt>
|
|
<dd>By default, everything is <i>annexed</i>, i.e., stored in a dataset annex</dd>
|
|
</dl>
|
|
<img height="400" src="../pics/artwork/src/publishing/publishing_gitvsannex.svg">
|
|
<small>
|
|
<table>
|
|
<tr>
|
|
<td><b>Git</b></td>
|
|
<td><b>git-annex</b></td>
|
|
</tr>
|
|
<tr>
|
|
<td>handles <b>small</b> files well (text, code)</td>
|
|
<td>handles <b>all</b> types and sizes of files well</td>
|
|
</tr>
|
|
<tr>
|
|
<td>file contents are in the Git history
|
|
and will be <b>shared</b> upon git/datalad push</td>
|
|
<td>file contents are in the annex. Not necessarily shared</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Shared with every dataset clone</td>
|
|
<td><b>Can be kept private</b> on a per-file level when sharing the dataset</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Useful: Small, non-binary, frequently modified, need-to-be-accessible (DUA, README) files </td>
|
|
<td>Useful: Large files, private files</td>
|
|
</tr>
|
|
</table>
|
|
</small>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<section>
|
|
<h2>(Raw) data mismanagement</h2>
|
|
<ul>
|
|
<li>Multiple large datasets are available on a compute cluster 🏞 </li>
|
|
<li>Each researcher creates their own copies of data ⛰ </li>
|
|
<li>Multiple different derivatives and results are computed from it 🏔</li>
|
|
<li>Data, copies of data, half-baked data transformations, results, and
|
|
old versions of results are kept - undocumented 🌋 </li>
|
|
</ul>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Share data like source code</h2>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="0" src="../pics/centralmanagement2.gif" alt="a screenrecording of cloning an institutional superdataset from GitLab">
|
|
<img class="fragment fade-in" data-fragment-index="1" style="box-shadow: 5px 5px 3px #888888" height="330" src="../pics/artwork/src/collaboration.svg">
|
|
</div>
|
|
<aside class="notes">
|
|
Idea behind datalad: Enable a similar level of tooling and culture for the distribution and version control of data as it is present for open source software development
|
|
</aside>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<h3>Transport logistics: Lots of data, little disk-usage</h3>
|
|
<ul>
|
|
<li class="fragment fade-in">
|
|
Cloned datasets are lean.
|
|
"Meta data" (file names, availability) are present, but <b>no file content</b>:</li>
|
|
<pre class="fragment fade-in"><code data-trim class="language-bash" onmousemove="showHover(event)" onmousedown="clickCopy(event)" onmouseleave="leaveElement(event)">$ datalad clone git@github.com:psychoinformatics-de/studyforrest-data-phase2.git
|
|
install(ok): /tmp/studyforrest-data-phase2 (dataset)
|
|
$ cd studyforrest-data-phase2 && du -sh
|
|
18M .</code></pre>
|
|
|
|
<li class="fragment fade-in">
|
|
files' contents can be retrieved on demand:
|
|
</li>
|
|
</ul>
|
|
<pre class="fragment fade-in"><code data-trim class="language-bash" onmousemove="showHover(event)" onmousedown="clickCopy(event)" onmouseleave="leaveElement(event)">$ datalad get sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
|
|
get(ok): /tmp/studyforrest-data-phase2/sub-01/ses-movie/func/
|
|
sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file) [from mddatasrc...]</code></pre>
|
|
|
|
<ul>
|
|
<li class="fragment fade-in">Have access to more data on your computer than you have disk-space:</li>
|
|
<pre class="fragment fade-in"><code># eNKI dataset (1.5TB, 34k files):
|
|
$ du -sh
|
|
1.5G .
|
|
# HCP dataset (~200TB, >15 million files)
|
|
$ du -sh
|
|
48G . </code></pre>
|
|
</ul>
|
|
</section>
|
|
|
|
<section data-markdown data-transition="None"> <script type="text/template">
|
|
## Plenty of data, but little disk-usage
|
|
|
|
Drop file content that is not needed:<!-- .element: class="fragment fade-in" -->
|
|
<pre class="fragment fade-in"><code data-trim class="language-bash">$ datalad drop sub-01/ses-movie/func/sub-01_ses-movie_task-movie_run-1_bold.nii.gz
|
|
drop(ok): /[...]/sub-01_ses-movie_task-movie_run-1_bold.nii.gz (file)
|
|
</code></pre>
|
|
Only "meta data" stays behind, and files can be re-obtained on demand. This allows for disk-space-aware computing workflows:<!-- .element: class="fragment fade-in" -->
|
|
<pre><code class="python">dl.get('input/sub-01')
|
|
[really complex analysis]
|
|
dl.drop('input/sub-01')
|
|
</code></pre><!-- .element: class="fragment fade-in" -->
|
|
</script></section>
|
|
|
|
|
|
<section data-transition="None">
|
|
<h2>Publishing datasets</h2>
|
|
<ul>
|
|
<li>Publish datasets, their annexed contents, or both to infrastructure of your choice</li>
|
|
</ul>
|
|
<div class="r-stack">
|
|
<img data-fragment-index="0" height="600" src="../pics/artwork/src/publishing/publishing_network_gitvsannex.svg">
|
|
</div>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Interoperability</h2>
|
|
<ul>
|
|
<li>DataLad is built to maximize interoperability and streamline routines across hosting and
|
|
storage technology</li>
|
|
</ul>
|
|
<img src="../pics/services_connected.png" height="650">
|
|
</section>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<!-- on modularity -->
|
|
<section data-markdown><script type="text/template">
|
|
## Modularity
|
|
|
|
<!-- .element: height="500" -->
|
|
|
|
- Typical workflow in science
|
|
- Prior works (algorithm development, empirical data, etc.) are combined
|
|
to produce novel results with the goal of a publication
|
|
- **Aggregation across time and contributors**
|
|
- Aiming (but often failing) to be reproducible
|
|
</script>
|
|
</section>
|
|
|
|
<section data-markdown><script type="text/template">
|
|
## Version control beyond single repositories
|
|
|
|
- **Why** are multiple repositories needed (in science)?
|
|
|
|
- Size impacts I/O and logistics
|
|
- Git can struggle with 1M+ files or 100k+ commits
|
|
- Filesystems (licensing) can struggle with large numbers of inodes
|
|
|
|
- Target audience is different
|
|
- Public vs. private or personal vs. anonymized data
|
|
|
|
- Pace of evolution or access patterns are different
|
|
- "Factual" raw data vs. choices of (pre-)processing
|
|
- Completed acquisition vs. ongoing study
|
|
<!-- .element: height="200" -->
|
|
- A **single repository is not enough**, but Git/Git-annex are not optimized
|
|
for such use cases
|
|
|
|
</script>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Dataset Nesting</h2>
|
|
|
|
<ul>
|
|
<li>Seamless nesting mechanisms:
|
|
<img height="330" src="../pics/artwork/src/linkage_subds.svg">
|
|
<ul>
|
|
<li>hierarchies of datasets in super-/sub-dataset relationships</li>
|
|
<li>based on Git submodules, but more seamless: Mono-repo feel thanks to recursive operations</li>
|
|
</ul>
|
|
<li class="fragment fade-in" data-fragment-index="2">Overcomes scaling issues with large amounts of files</li>
|
|
<pre class="fragment fade-in" data-fragment-index="2"><code>adina@bulk1 in /ds/hcp/super on git:master❱ datalad status --annex -r
|
|
15530572 annex'd files (77.9 TB recorded total size)
|
|
nothing to save, working tree clean</code></pre>
|
|
<small><a class="fragment fade-in" data-fragment-index="2" href="https://github.com/datalad-datasets/human-connectome-project-openaccess" target="_blank">(github.com/datalad-datasets/human-connectome-project-openaccess)</a></small>
|
|
<li class="fragment fade-in">Modularizes research components for transparency, reuse, and access management</li>
|
|
</ul>
|
|
</section>
|
|
|
|
|
|
<section data-transition="None">
|
|
<h2>Intuitive data analysis structure</h2>
|
|
|
|
<ul><li>You can link datasets together in superdataset-subdataset hierarchies:</li></ul>
|
|
<img src="../pics/artwork/src/linkage_subds.svg" width="900"> <br>
|
|
<pre><code style="max-width:none" class="bash" data-line-numbers="1,3, 6">$ cd myanalysis
|
|
# we can install analysis input data as a subdataset to the dataset
|
|
$ datalad clone -d . https://github.com/datalad-handbook/iris_data.git input/
|
|
[INFO ] Scanning for unlocked files (this may take some time)
|
|
[INFO ] Remote origin not usable by git-annex; setting annex-ignore
|
|
install(ok): input (dataset)
|
|
add(ok): input (file)
|
|
add(ok): .gitmodules (file)
|
|
save(ok): . (dataset)
|
|
action summary:
|
|
add (ok: 2)
|
|
install (ok: 1)
|
|
save (ok: 1)
|
|
</code></pre>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Leaving a trace </h2>
|
|
<p>"Shit, which version of which script produced these outputs from which version
|
|
of what data?"</p>
|
|
<p>
|
|
"Shit, which buttons did I click and in which order did I use all those tools?"</p>
|
|
<br>
|
|
<p>
|
|
<img src="../pics/manuallabor.png">
|
|
<img src="../pics/findfiles.png" height="400">
|
|
<img src="../pics/projectstack.png" height="350">
|
|
<imgcredit>CC-BY Scriberia and <a href="https://the-turing-way.netlify.app/reproducible-research/rdm.html" target="_blank">
|
|
The Turing Way</a>
|
|
</imgcredit>
|
|
</p>
|
|
</section>
|
|
|
|
|
|
|
|
<section data-transition="None">
|
|
<h2>Leaving a trace</h2>
|
|
<p class="fragment" data-fragment-index="1"> <strong>datalad run</strong> wraps around anything expressed in a command
|
|
line call and saves the dataset modifications resulting from the execution.</p>
|
|
<p class="fragment" data-fragment-index="2"> <strong>datalad rerun</strong> repeats captured executions.
|
|
If the outcomes
|
|
differ, it saves a new state of them.</p>
|
|
<p class="fragment" data-fragment-index="3"> <strong>datalad containers-run</strong> executes command
|
|
line calls inside a tracked software container and saves the dataset modifications resulting from the execution.</p>
|
|
|
|
<div class="r-stack">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="1" src="../pics/run_basic.svg" height="350">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/rerun.svg" height="350">
|
|
<img class="fragment fade-in" data-fragment-index="3" src="../pics/containers-run_basic.svg" height="350">
|
|
</div>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px">
|
|
Enshrine the analysis in a script
|
|
</p>
|
|
<p class="fragment fade-in" style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:180px;margin-bottom:-60px">
|
|
Here: extract_lc_timeseries.py
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="6">$ datalad containers-run \
|
|
--message "Time series extraction from Locus Coeruleus" \
|
|
--container-name nilearn \
|
|
--input 'mri/*_bold.nii' \
|
|
--output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"python3 code/extract_lc_timeseries.py"
|
|
|
|
-- Git commit --
|
|
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
|
|
Author: Adina Wagner adina.wagner@t-online.de
|
|
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
|
|
Commit: Adina Wagner adina.wagner@t-online.de
|
|
CommitDate: Mon Nov 11 16:15:08 2019 +0100
|
|
|
|
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...
|
|
</code></pre>
|
|
</p>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:130px;margin-bottom:-60px;margin-left:750px">
|
|
Record code execution together <br> with
|
|
input-data, output files and software
|
|
environment in the
|
|
execution-command
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="1-6">$ datalad containers-run \
|
|
--message "Time series extraction from Locus Coeruleus" \
|
|
--container-name nilearn \
|
|
--input 'mri/*_bold.nii' \
|
|
--output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"python3 code/extract_lc_timeseries.py"
|
|
|
|
-- Git commit --
|
|
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
|
|
Author: Adina Wagner adina.wagner@t-online.de
|
|
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
|
|
Commit: Adina Wagner adina.wagner@t-online.de
|
|
CommitDate: Mon Nov 11 16:15:08 2019 +0100
|
|
|
|
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...
|
|
</code></pre>
|
|
</p>
|
|
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:200px">
|
|
Result: machine readable record about which data, code, and <br>
|
|
software produced a result how, when, and why.
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="8-30">$ datalad containers-run \
|
|
--message "Time series extraction from Locus Coeruleus" \
|
|
--container-name nilearn \
|
|
--input 'mri/*_bold.nii' \
|
|
--output 'sub-*/LC_timeseries_run-*.csv' \
|
|
"python3 code/extract_lc_timeseries.py"
|
|
|
|
-- Git commit --
|
|
commit 5a7565a640ff6de67e07292a26bf272f1ee4b00e
|
|
Author: Adina Wagner adina.wagner@t-online.de
|
|
AuthorDate: Mon Nov 11 16:15:08 2019 +0100
|
|
Commit: Adina Wagner adina.wagner@t-online.de
|
|
CommitDate: Mon Nov 11 16:15:08 2019 +0100
|
|
|
|
[DATALAD RUNCMD] Time series extraction from Locus Coeruleus
|
|
=== Do not change lines below ===
|
|
{
|
|
"cmd": "singularity exec --bind {pwd} .datalad/environments/nilearn.simg bash..",
|
|
"dsid": "92ea1faa-632a-11e8-af29-a0369f7c647e",
|
|
"inputs": [
|
|
"mri/*.bold.nii.gz",
|
|
".datalad/environments/nilearn.simg"
|
|
],
|
|
"outputs": ["sub-*/LC_timeseries_run-*.csv"],
|
|
...
|
|
}
|
|
^^^ Do not change lines above ^^^
|
|
---
|
|
sub-01/LC_timeseries_run-1.csv | 1 +
|
|
...
|
|
</code></pre>
|
|
</p>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:60px;margin-bottom:-60px;margin-left:350px">
|
|
Use the unique identifier of the execution record
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="1">$ datalad rerun 5a7565a640ff6de67
|
|
[INFO ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
|
|
[INFO ] Making sure inputs are available (this may take some time)
|
|
get(ok): mri/sub-01_bold.nii (file)
|
|
get(ok): mri/sub-02_bold.nii (file)
|
|
[...]
|
|
[INFO ] == Command start (output follows) =====
|
|
[INFO ] == Command exit (modification check follows) =====
|
|
add(ok): sub-01/LC_timeseries_run-*.csv (file)
|
|
add(ok): sub-02/LC_timeseries_run-*.csv (file)
|
|
[...]
|
|
action summary:
|
|
add (ok: 30)
|
|
get (ok: 30)
|
|
save (ok: 2)
|
|
unlock (ok: 30)
|
|
</code></pre>
|
|
</p>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>data analysis provenance</h2>
|
|
<p style="z-index: 100;position: fixed;background-color:#ede6d5;font-size:35px;box-shadow: 10px 10px 8px #888888;margin-top:400px;margin-bottom:-60px;margin-left:350px">
|
|
... to have a machine recompute and verify past work
|
|
</p>
|
|
<p style="z-index: -1">
|
|
<pre><code class="bash" style="max-height:none" data-line-numbers="2-16">$ datalad rerun 5a7565a640ff6de67
|
|
[INFO ] run commit 5a7565a640ff6de67; (Time series extraction from Locus Coeruleus)
|
|
[INFO ] Making sure inputs are available (this may take some time)
|
|
get(ok): mri/sub-01_bold.nii (file)
|
|
get(ok): mri/sub-02_bold.nii (file)
|
|
[...]
|
|
[INFO ] == Command start (output follows) =====
|
|
[INFO ] == Command exit (modification check follows) =====
|
|
add(ok): sub-01/LC_timeseries_run-*.csv (file)
|
|
add(ok): sub-02/LC_timeseries_run-*.csv (file)
|
|
[...]
|
|
action summary:
|
|
add (ok: 30)
|
|
get (ok: 30)
|
|
save (ok: 2)
|
|
unlock (ok: 30)
|
|
</code></pre>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
<section>
|
|
<h2>DataLad for scientific workflows?</h2>
|
|
<dl>
|
|
<dt class="fragment fade-in-then-semi-out" data-fragment-index="1">Scientific building blocks are not static.</dt>
|
|
<dd class="fragment fade-in-then-semi-out" data-fragment-index="2">Version control beyond text</dd>
|
|
<dt class="fragment fade-in-then-semi-out" data-fragment-index="3">Science is built from modular units.</dt>
|
|
<dd class="fragment fade-in-then-semi-out" data-fragment-index="4">Nesting</dd>
|
|
<dt class="fragment fade-in-then-semi-out" data-fragment-index="5">Science is exploratory, iterative, multi-stepped, and complex.</dt>
|
|
<dd class="fragment fade-in-then-semi-out" data-fragment-index="6">Provenance</dd>
|
|
<dt class="fragment fade-in-then-semi-out" data-fragment-index="7">Science is collaborative.</dt>
|
|
<dd class="fragment fade-in-then-semi-out" data-fragment-index="8">Transport logistics</dd>
|
|
</dl>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>Research data management is tied to reproducibility</h2>
|
|
<img src="../pics/fragile.png" height="800">
|
|
<imgcredit>Based on <a href="https://xkcd.com/2347/" target="_blank">
|
|
xkcd.com/2347/</a> (CC-BY)</imgcredit>
|
|
<small><a href="https://www.youtube.com/watch?v=nTVcMDVlyOI" target="_blank">
|
|
Reproducibility Management in Neuroscience -
|
|
Specific Issues and Solutions</a>
|
|
(<a href="https://doi.org/10.5281/zenodo.4285927" target="_blank">DOI 10.5281/zenodo.4285927</a>) </small>
|
|
</section>
|
|
</section>
|
|
|
|
<section>
|
|
<section data-markdown data-transition="None"><script type="text/template">
|
|
## FAIRly big: Scaling up
|
|
|
|
Objective: Process the UK Biobank (imaging data)
|
|
<!-- .element: height="400" -->
|
|
|
|
- 76 TB in 43 million files in total
|
|
- 42,715 participants contributed personal health data
|
|
- Strict DUA
|
|
- Custom binary-only downloader
|
|
- Most data records offered as (unversioned) ZIP files
|
|
</script></section>
|
|
|
|
<section data-markdown data-transition="None"><script type="text/template">
|
|
## Challenges
|
|
|
|
- Process data such that
|
|
- Results are computationally reproducible (without the original compute infrastructure)
|
|
- There is complete linkage from results to an individual data record download
|
|
- It scales with the amount of available compute resources
|
|
|
|
- Data processing pipeline
|
|
- Compiled MATLAB blob
|
|
- 1h processing time per image, with 41k images to process
|
|
- 1.2 M output files (30 output files per input file)
|
|
- 1.2 TB total size of outputs
|
|
</script></section>
|
|
|
|
<section data-transition="None">
|
|
<h2> FAIRly big setup</h2>
|
|
<img src="../pics/fairlybig_ukbsetup.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
|
|
|
|
<ul style="font-size:30px">
|
|
<strong>Exhaustive tracking</strong>
|
|
<li><a href="https://github.com/datalad/datalad-ukbiobank" target="_blank">datalad-ukbiobank</a>
|
|
extension downloads, transforms & tracks the evolution of the complete data release
|
|
in DataLad datasets
|
|
</li>
|
|
<li>Native and BIDSified data layout (at no additional disk space usage)</li>
|
|
<li>Structured in 42k individual datasets, combined to one superdataset</li>
|
|
<li>Containerized pipeline in a software container</li>
|
|
<li>Link input data & computational pipeline as dependencies</li>
|
|
</ul>
|
|
<br><br>
|
|
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
|
|
Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
|
|
</small>
|
|
</section>
|
|
|
|
<section data-transition="None">
|
|
<h2>FAIRly big workflow</h2>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-out" src="../pics/fairlybig_workflow.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
|
|
<img src="../pics/htcondor.svg" class="fragment fade-in">
|
|
</div>
|
|
<br>
|
|
<ul style="font-size:30px">
|
|
<strong>Portability</strong>
|
|
<li>Parallel processing: 1 job = 1 subject
|
|
(number of concurrent jobs capped at the capacity of the compute cluster)
|
|
</li>
|
|
<li>Each job is computed in an ephemeral (short-lived) dataset clone, results are pushed back:
|
|
Ensure exhaustive tracking &
|
|
portability during computation</li>
|
|
<li>Content-agnostic persistent (encrypted) storage (minimizing storage and inodes)</li>
|
|
<li>Common data representation in secure environments</li>
|
|
</ul>
|
|
<br><br>
|
|
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
|
|
Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
|
|
</small></section>
|
|
|
|
|
|
|
|
<section data-transition="None">
|
|
<h2>FAIRly big provenance capture</h2>
|
|
<img src="../pics/fairlybig_prov.png" width="1200" style="margin-top:-35px;margin-bottom:-30px">
|
|
<br><br>
|
|
<ul style="font-size:30px">
|
|
<strong>Provenance</strong>
|
|
<li>Every single pipeline execution is tracked</li>
|
|
<li>Execution in ephemeral workspaces ensures results are
|
|
individually reproducible without HPC access</li>
|
|
</ul>
|
|
<br><br>
|
|
<small><a href="https://www.nature.com/articles/s41597-022-01163-2" target="_blank">
|
|
Wagner, Waite, Wierzba et al. (2022). FAIRly big: A framework for computationally reproducible processing of large-scale data.</a>
|
|
</small></section>
|
|
|
|
<section data-markdown><script type="text/template">
|
|
## FAIRly big movie
|
|
|
|
<iframe width="1120" height="630" src="https://www.youtube-nocookie.com/embed/UsW6xN2f2jc?start=17" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
|
|
|
- Two computations on clusters of different scale (small cluster, supercomputer). Full video: https://youtube.com/datalad
|
|
- Two full (re-)computations, programmatically comparable, verifiable, reproducible -- on any system with data access
|
|
</script></section>
|
|
</section>
|
|
|
|
<section>
|
|
|
|
<section>
|
|
<h2>Current and future developments</h2>
|
|
</section>
|
|
<!-- I've shown you this slide already - data changes, often due to mishaps from scientists.
|
|
But sometimes, it's also political -->
|
|
<section data-transition="None">
|
|
|
|
<h3>The building blocks of a scientific result are <br>
|
|
... sometimes unreliable or threatened</h3>
|
|
<table>
|
|
<tr>
|
|
<div class="r-stack">
|
|
<p class="fragment fade-in-then-out" data-fragment-index="2">Mar 2019</p>
|
|
<!--<p class="fragment fade-in-then-out" data-fragment-index="3">Spring 2019</p>
|
|
<p class="fragment fade-in-then-out" data-fragment-index="4">July 2019</p>
|
|
<p class="fragment fade-in-then-out" data-fragment-index="5">Dec 2019</p>-->
|
|
<p class="fragment fade-in-then-out" data-fragment-index="3">Mar 2025 <br>
|
|
<small> <a href="https://www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification/" target="_blank">www.404media.co/nih-archives-repositories-marked-for-review-for-potential-modification</a> </small></p>
|
|
</div>
|
|
</tr>
|
|
<tr>
|
|
<div class="r-stack">
|
|
<img class="fragment fade-out" data-fragment-index="2" src="../pics/phd052810s.png" height="700">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="2" src="../pics/abcd_data_issues1.png">
|
|
<!-- <img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/abcd_data_issues2.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="4" src="../pics/abcd_data_issues3.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="5" src="../pics/abcd_data_issues4.png">
|
|
<img class="fragment fade-in-then-out" data-fragment-index="6" src="../pics/abcd_data_issues4.1.png">-->
|
|
<img class="fragment fade-in-then-out" data-fragment-index="3" src="../pics/nda_review.png">
|
|
</div>
|
|
<imgcredit class="fragment fade-out" data-fragment-index="1">Piled Higher and Deeper
|
|
<a href="https://phdcomics.com/comics/archive_print.php?comicid=1323" target="_blank">
|
|
1323
|
|
</a> </imgcredit>
|
|
|
|
</tr>
|
|
</table>
|
|
|
|
<div class="r-stack">
|
|
<p style="vertical-align:middle" class="fragment fade-in" data-fragment-index="3"><u>Data</u> changes <br>
|
|
<small>Due to presidential executive orders<br>
|
|
to remove files mentioning "gender"<br>
|
|
</small></p>
|
|
</div>
|
|
</section>
|
|
|
|
|
|
<!-- Previously, we told PhD students that decentralization saves them time
|
|
when -->
|
|
<section>
|
|
<h2>Freedom? Choose Decentralization</h2>
|
|
<ul>
|
|
<li>Infrastructure is ephemeral:</li>
|
|
<ul>
|
|
<li>Change of institutional contracts</li>
|
|
<li>Change of affiliations</li>
|
|
<li>Geopolitical developments?</li>
|
|
</ul>
|
|
<li>DataLad datasets are portable</li>
|
|
<ul>
|
|
<li>Effortless migrations to different Git or data hosting</li>
|
|
<li>Versioning allows for integrity checks</li>
|
|
</ul>
|
|
</ul>
|
|
<br><br><br>
|
|
<p >Delineation and advantages of decentral versus central RDM:<br><a href="https://doi.org/10.1515/nf-2020-0037" target="_blank">
|
|
Hanke et al., (2021). In defense of decentralized research data management</a></p>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<h2>Going self-hosted with forgejo-aneksajo</h2>
|
|
<ul>
|
|
<li>Forgejo (<a href="https://forgejo.org" target="_blank">forgejo.org</a>): Fork of Gitea</li>
|
|
<li class="fragment fade-in"><a href="https://codeberg.org/forgejo-aneksajo/forgejo-aneksajo" target="_blank">
|
|
Forgejo-aneksajo</a>: Forgejo with git-annex support</li>
|
|
</ul>
|
|
<div class="r-stack">
|
|
<img src="../pics/datalad-hub-frontpage.png">
|
|
<img class="fragment fade in" src="../pics/naturalistic-imaging-hub.png">
|
|
</div>
|
|
</section>
|
|
|
|
<section data-markdown data-transition="none"><script type="text/template">
|
|
### Full-stack RDM for independent, interoperable collaborators
|
|

|
|
|
|
<!-- .element: width="400" style="margin-top:-20px;margin-bottom:-10px" -->
|
|
|
|
|
|
scale-free organization: consortium, institution, lab, researcher
|
|
<div style="float:left;max-width:50%">
|
|
<ul>
|
|
<li>maximum contributor benefit</li>
|
|
<li>self-hostable, independently governed solutions, e.g.,
|
|
<a href="https://atris.fz-juelich.de" target="_blank">atris.fz-juelich.de</a>, <a href="https://hub.trr379.de" target="_blank">hub.trr379.de</a> </li>
|
|
</ul>
|
|
</div>
|
|
<div style="float:left;max-width:50%">
|
|
<ul>
|
|
<li>minimum contributor cost</li>
|
|
<li>self-contained contributor scopes, not inheriting complexity of others</li>
|
|
</ul>
|
|
</div>
|
|
</script></section>
|
|
|
|
<section>
|
|
<h2>Development Roadmap</h2>
|
|
<img src="../pics/roadmap_2025.png">
|
|
</section>
|
|
|
|
|
|
<section style="font-size:45px" data-transition="None" data-background-image="../pics/distribits-teaser-2025.svg"
|
|
data-background-size="1800px" data-background-opacity="0.2">
|
|
<h1>Join us!</h1>
|
|
<ul>
|
|
<strong>Distribits 2025</strong>
|
|
<li>International conference on technologies for distributed data management</li>
|
|
<li>2-day conference plus single-day Hackathon </li>
|
|
<li>@ Haus der Universität Düsseldorf</li>
|
|
<li>Registration open until May 1st</li>
|
|
</ul>
|
|
<br><br><br><br>
|
|
<h2><a href="https://distribits.live" target="_blank">distribits.live</a> </h2>
|
|
</section>
|
|
</section>
|
|
|
|
|
|
<section>
|
|
|
|
|
|
<section>
|
|
<h2>DataLad contact and more information</h2>
|
|
<table>
|
|
<tr><td>Website + Demos</td>
|
|
<td><a href="http://datalad.org">http://datalad.org</a></td>
|
|
</tr><tr><td>Documentation</td>
|
|
<td><a href="http://handbook.datalad.org">http://handbook.datalad.org</a></td>
|
|
</tr><tr><td>Talks and tutorials</td>
|
|
<td><a href="https://youtube.com/datalad">https://youtube.com/datalad</a></td>
|
|
</tr><tr><td>Development</td>
|
|
<td><a href="http://github.com/datalad">http://github.com/datalad</a></td>
|
|
</tr><tr><td>Support</td>
|
|
<td><a href="https://matrix.to/#/#datalad:matrix.org">https://matrix.to/#/#datalad:matrix.org</a></td>
|
|
</tr><tr><td>Open data</td>
|
|
<td><a href="http://datasets.datalad.org">http://datasets.datalad.org</a></td>
|
|
</tr>
|
|
<tr><td>Mastodon</td>
|
|
<td>@datalad@fosstodon.org</td>
|
|
</tr>
|
|
</table>
|
|
</section>
|
|
<section data-markdown><script type="text/template">
|
|
## Extensive documentation and training materials
|
|
<!-- .element: width="700" style="margin-top:-20px;margin-bottom:-10px" -->
|
|
|
|
https://handbook.datalad.org (or ISBN 979-8857037973)
|
|
|
|
- **educational materials** on technologies — **targeting researchers**, not developers (executable paper, student supervisor workflow,
|
|
...)
|
|
- handbook on concepts, workflows, and use cases
|
|
- **weekly public (virtual) office hour**
|
|
|
|
Note:
|
|
RDM Education is key. Handbook helps people be more productive, yielding more FAIR resources as an outcome, but not as the main goal.
|
|
</script></section>
|
|
|
|
</section>
|
|
|
|
<section>
|
|
<section>
|
|
<h1>Thanks!</h1>
|
|
<img src="../pics/qr_nhr.png" height="400px">
|
|
</section>
|
|
</section>
|
|
<section>
|
|
<!-- BACKUP -->
|
|
<section data-markdown><script type="text/template">
|
|
## Talk is cheap, show me the code: Git vs. DataLad
|
|
|
|
<iframe width="1120" height="630" src="https://www.youtube-nocookie.com/embed/Yrg6DgOcbPE" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
|
|
|
|
https://www.youtube.com/watch?v=Yrg6DgOcbPE
|
|
|
|
<aside class="notes">
|
|
- show git limits: commit a change in a 3rd-level submodule
|
|
- show annex limits: get file in a subdataset
|
|
- reveal: datalad makes repo-boundaries vanish -- show save -r
|
|
</aside>
|
|
</script></section>
|
|
</section>
|
|
|
|
|
|
|
|
|
|
|
|
</div>
|
|
</div>
|
|
|
|
<script src="../reveal.js/dist/reveal.js"></script>
|
|
<script src="../reveal.js/plugin/notes/notes.js"></script>
|
|
<script src="../reveal.js/plugin/markdown/markdown.js"></script>
|
|
<script src="../reveal.js/plugin/highlight/highlight.js"></script>
|
|
<script>
|
|
// More info about initialization & config:
|
|
// - https://revealjs.com/initialization/
|
|
// - https://revealjs.com/config/
|
|
Reveal.initialize({
|
|
hash: true,
|
|
// The "normal" size of the presentation, aspect ratio will be preserved
|
|
// when the presentation is scaled to fit different resolutions. Can be
|
|
// specified using percentage units.
|
|
width: 1280,
|
|
height: 960,
|
|
// Factor of the display size that should remain empty around the content
|
|
margin: 0.3,
|
|
// Bounds for smallest/largest possible scale to apply to content
|
|
minScale: 0.2,
|
|
maxScale: 1.0,
|
|
|
|
controls: true,
|
|
progress: true,
|
|
history: true,
|
|
center: true,
|
|
slideNumber: 'c',
|
|
pdfSeparateFragments: true,
|
|
pdfMaxPagesPerSlide: 1,
|
|
pdfPageHeightOffset: -1,
|
|
transition: 'slide', // none/fade/slide/convex/concave/zoom
|
|
// Learn about plugins: https://revealjs.com/plugins/
|
|
plugins: [ RevealMarkdown, RevealHighlight, RevealNotes ]
|
|
});
|
|
</script>
|
|
</body>
|
|
</html>
|