import json
import duckdb
= duckdb.connect()
con
= 'https://api.braincircuits.io/data/fruitfly_fanc_public'
baseurl_fanc = 'https://api.braincircuits.io/data/fruitfly_manc' baseurl_manc
Goals
How to compute region-based summaries of synaptic data for identified neurons in two datasets and compare between the two
Datasets
We will be using two fruitfly ventral nerve cord datasets for the following analysis, namely FANC (fruitfly_fanc_public
) and MANC (fruitfly_manc
).
Data Model
In both the MANC and FANC dataset, we are provided with a synapse_region.parquet
file that links each predicted synaptic location to a region. The region labels is available in the region
columns
Code
First, we load DuckDB, create a connection and set the two baseurls
We pick two homologous descending neurons on the left and right side that are matched between the datasets. We produce a string variable where we concatenated the segment IDs for the subsequent queries.
= [648518346492614075, 648518346478550356]
segments_fanc = [10118, 10126]
segments_manc
= ','.join(map(str, segments_fanc))
segids_fanc = ','.join(map(str, segments_manc)) segids_manc
Let’s get the basic information for those neurons.
= con.query(f"""SELECT *
df FROM '{baseurl_fanc}/neurons.parquet'
WHERE segment_id in ({segids_fanc})""").df()
print(json.dumps(df.to_dict(orient='records'), indent=2))
[
{
"cell_id": 11162,
"x": 45064,
"y": 75200,
"z": 1294,
"cell_id_type": "neck connective",
"sv_id": 73608180982162608,
"segment_id": 648518346492614075,
"label": "anterior-posterior projection pattern; central neuron; descending; neck connective (right); primary class"
},
{
"cell_id": 10178,
"x": 34253,
"y": 75200,
"z": 1456,
"cell_id_type": "neck connective",
"sv_id": 73185968584372736,
"segment_id": 648518346478550356,
"label": "Lesser Azevedo et al. 2023; anterior-posterior projection pattern; central neuron; descending; neck connective (left); primary class; publication; uk"
}
]
= con.query(f"""SELECT *
df FROM '{baseurl_manc}/neurons.parquet'
WHERE segment_id in ({segids_manc})""").df()
print(json.dumps(df.to_dict(orient='records'), indent=2))
[
{
"segment_id": 10118,
"label": "TBD",
"celltype": "DNa02",
"nr_downstream_partner": 20392,
"nr_post": 886,
"nr_pre": 2566,
"nr_upstream_partner": 886,
"birthtime": null,
"class": "descending neuron",
"entryNerve": "CvC",
"exitNerve": "None",
"group": 10118.0,
"hemilineage": "None",
"longTract": "ITD",
"modality": null,
"origin": "brain",
"positionType": "user",
"predictedNt": "acetylcholine",
"rootSide": "RHS",
"serial": NaN,
"serialMotif": null,
"size": 6179155956,
"somaNeuromere": null,
"somaSide": null,
"subclass": "xl",
"subcluster": NaN,
"synonyms": null,
"systematicType": "DNxl002",
"tag": null,
"target": "LegNp_R",
"transmission": null
},
{
"segment_id": 10126,
"label": "TBD",
"celltype": "DNa02",
"nr_downstream_partner": 20026,
"nr_post": 880,
"nr_pre": 2509,
"nr_upstream_partner": 880,
"birthtime": null,
"class": "descending neuron",
"entryNerve": "CvC",
"exitNerve": "None",
"group": 10118.0,
"hemilineage": "None",
"longTract": "ITD",
"modality": null,
"origin": "brain",
"positionType": "user",
"predictedNt": "acetylcholine",
"rootSide": "LHS",
"serial": NaN,
"serialMotif": null,
"size": 6459395030,
"somaNeuromere": null,
"somaSide": null,
"subclass": "xl",
"subcluster": NaN,
"synonyms": null,
"systematicType": "DNxl002",
"tag": null,
"target": "LegNp_L",
"transmission": null
}
]
Before we go into retrieving more details about the neurons, let’s get an overview of the available regions and their synaptic link count in both datasets.
For FANC
f"""select region, count() as count
con.execute( from '{baseurl_fanc}/synapse_link.parquet'
group by region order by region;""").df()
region | count | |
---|---|---|
0 | 25874 | |
1 | AMNp_L | 6991 |
2 | AMNp_R | 1504 |
3 | ANm_L | 3390 |
4 | ANm_R | 3072 |
5 | CFF_L | 599 |
6 | CFF_R | 37 |
7 | DLT_L | 3181 |
8 | DLT_R | 1012 |
9 | DLV_L | 287 |
10 | DLV_R | 92 |
11 | DMT_L | 4695 |
12 | DMT_R | 5210 |
13 | HTct_L | 73846 |
14 | HTct_R | 28179 |
15 | ITD_HC_L | 345 |
16 | ITD_HC_R | 45 |
17 | ITD_HT_L | 2653 |
18 | ITD_HT_R | 196 |
19 | ITD_L | 3601 |
20 | ITD_R | 340 |
21 | IntTct_L | 46649 |
22 | IntTct_R | 20328 |
23 | LTct_L | 9885 |
24 | LTct_R | 8490 |
25 | MDT_L | 4276 |
26 | MDT_R | 2969 |
27 | MesoNm_L | 7065 |
28 | MesoNm_R | 1405 |
29 | MetaNm_L | 5071 |
30 | MetaNm_R | 3316 |
31 | NTct_L | 12177 |
32 | NTct_R | 3352 |
33 | ProNm_L | 15214 |
34 | ProNm_R | 5211 |
35 | VLT_L | 1451 |
36 | VLT_R | 460 |
37 | VTV_L | 109 |
38 | VTV_R | 112 |
39 | WTct_L | 203787 |
40 | WTct_R | 94022 |
41 | mVAC_L | 15 |
42 | mVAC_R | 8 |
And in MANC
f"""select region, count() as count
con.execute( from '{baseurl_manc}/synapse_link.parquet'
group by region order by region;""").df()
region | count | |
---|---|---|
0 | <unspecified> | 28602 |
1 | ADMN(L) | 1460 |
2 | ADMN(R) | 1569 |
3 | ANm | 3745214 |
4 | AbN1(L) | 2 |
5 | AbN1(R) | 4 |
6 | AbN2(L) | 39 |
7 | AbN2(R) | 30 |
8 | AbN3(L) | 51 |
9 | AbN3(R) | 75 |
10 | AbN4(L) | 278 |
11 | AbN4(R) | 117 |
12 | AbNT | 144 |
13 | CV | 8772 |
14 | CvN(L) | 28 |
15 | DMetaN(L) | 539 |
16 | DMetaN(R) | 364 |
17 | DProN(L) | 19 |
18 | DProN(R) | 442 |
19 | HTct(UTct-T3)(L) | 818354 |
20 | HTct(UTct-T3)(R) | 832300 |
21 | IntTct | 1266501 |
22 | LTct | 1214795 |
23 | LegNp(T1)(L) | 3169291 |
24 | LegNp(T1)(R) | 3347120 |
25 | LegNp(T2)(L) | 3148099 |
26 | LegNp(T2)(R) | 3635004 |
27 | LegNp(T3)(L) | 3252860 |
28 | LegNp(T3)(R) | 3780774 |
29 | MesoAN(L) | 8 |
30 | MesoAN(R) | 31 |
31 | MesoLN(L) | 184 |
32 | MesoLN(R) | 394 |
33 | MetaLN(L) | 236 |
34 | MetaLN(R) | 198 |
35 | NTct(UTct-T1)(L) | 272896 |
36 | NTct(UTct-T1)(R) | 288028 |
37 | Ov(L) | 841866 |
38 | Ov(R) | 865094 |
39 | PDMN(L) | 942 |
40 | PDMN(R) | 959 |
41 | PrN(L) | 4 |
42 | PrN(R) | 218 |
43 | ProAN(L) | 7 |
44 | ProAN(R) | 71 |
45 | ProCN(L) | 2 |
46 | ProCN(R) | 158 |
47 | ProLN(L) | 334 |
48 | ProLN(R) | 828 |
49 | VProN(L) | 88 |
50 | VProN(R) | 38 |
51 | WTct(UTct-T2)(L) | 970855 |
52 | WTct(UTct-T2)(R) | 920499 |
53 | mVAC(T1)(L) | 99067 |
54 | mVAC(T1)(R) | 127890 |
55 | mVAC(T2)(L) | 72420 |
56 | mVAC(T2)(R) | 95405 |
57 | mVAC(T3)(L) | 127490 |
58 | mVAC(T3)(R) | 138634 |
We see that the nomenclature for the brain regions are different in both datasets. In this table you can find the region, tract, nerve and connective correspondences contributed by Kathi Eichler.
We can now query the synapse_link.parquet
table which constains a region
column for each dataset to retrieve their information. We’re interested here in the regions where the presynaptic sites are located, i.e. the output region of the neuron.
= con.query(f"""SELECT pre_segment_id, post_segment_id, region
df FROM '{baseurl_fanc}/synapse_link.parquet'
WHERE pre_segment_id in ({segids_fanc})""").df()
df
pre_segment_id | post_segment_id | region | |
---|---|---|---|
0 | 648518346492614075 | 648518346481564097 | ProNm_R |
1 | 648518346478550356 | 648518346480882144 | NTct_L |
2 | 648518346478550356 | 648518346480882144 | NTct_L |
3 | 648518346478550356 | 648518346478550356 | ProNm_L |
4 | 648518346478550356 | 648518346487504531 | ProNm_L |
... | ... | ... | ... |
1178 | 648518346478550356 | 648518346480882144 | WTct_L |
1179 | 648518346478550356 | 648518346471876251 | WTct_L |
1180 | 648518346478550356 | 648518346471876251 | WTct_L |
1181 | 648518346492614075 | 648518346494780554 | NTct_R |
1182 | 648518346492614075 | 648518346494780554 |
1183 rows × 3 columns
And for MANC
= con.query(f"""SELECT pre_segment_id, post_segment_id, region
df FROM '{baseurl_manc}/synapse_link.parquet'
WHERE pre_segment_id in ({segids_manc})""").df()
df
pre_segment_id | post_segment_id | region | |
---|---|---|---|
0 | 10118 | 26238 | LegNp(T3)(R) |
1 | 10118 | 21649 | LegNp(T3)(R) |
2 | 10118 | 18842 | LegNp(T3)(R) |
3 | 10118 | 10968 | LegNp(T3)(R) |
4 | 10118 | 26238 | LegNp(T3)(R) |
... | ... | ... | ... |
13763 | 10118 | 159083 | CV |
13764 | 10126 | 10829 | CV |
13765 | 10126 | 29930 | CV |
13766 | 10118 | 20352 | CV |
13767 | 10126 | 10829 | CV |
13768 rows × 3 columns
Next, we like to get the total count of presynaptic location for each segment and region. We can easily do that byy using the group by
in the SQL statement:
f"""SELECT pre_segment_id, region, count(*) as count
con.query( FROM '{baseurl_fanc}/synapse_link.parquet'
WHERE pre_segment_id in ({segids_fanc}) \
GROUP BY region, pre_segment_id
ORDER BY pre_segment_id asc, count desc""").df()
pre_segment_id | region | count | |
---|---|---|---|
0 | 648518346478550356 | HTct_L | 221 |
1 | 648518346478550356 | ProNm_L | 111 |
2 | 648518346478550356 | NTct_L | 63 |
3 | 648518346478550356 | WTct_L | 59 |
4 | 648518346478550356 | IntTct_L | 42 |
5 | 648518346478550356 | MesoNm_L | 39 |
6 | 648518346478550356 | ITD_L | 16 |
7 | 648518346478550356 | ITD_HT_L | 13 |
8 | 648518346478550356 | ITD_HC_L | 7 |
9 | 648518346478550356 | MetaNm_L | 7 |
10 | 648518346478550356 | 5 | |
11 | 648518346478550356 | MDT_L | 2 |
12 | 648518346478550356 | VLT_L | 1 |
13 | 648518346478550356 | ANm_L | 1 |
14 | 648518346492614075 | HTct_R | 303 |
15 | 648518346492614075 | NTct_R | 87 |
16 | 648518346492614075 | IntTct_R | 48 |
17 | 648518346492614075 | ProNm_R | 48 |
18 | 648518346492614075 | WTct_R | 26 |
19 | 648518346492614075 | ITD_R | 22 |
20 | 648518346492614075 | MDT_R | 16 |
21 | 648518346492614075 | MesoNm_R | 13 |
22 | 648518346492614075 | MetaNm_R | 12 |
23 | 648518346492614075 | ITD_HC_R | 9 |
24 | 648518346492614075 | DMT_R | 5 |
25 | 648518346492614075 | 4 | |
26 | 648518346492614075 | ANm_R | 1 |
27 | 648518346492614075 | ITD_HT_R | 1 |
28 | 648518346492614075 | VLT_R | 1 |
And for MANC
f"""SELECT pre_segment_id, region, count(*) as count
con.query( FROM '{baseurl_manc}/synapse_link.parquet'
WHERE pre_segment_id in ({segids_manc})
GROUP BY region, pre_segment_id
ORDER BY pre_segment_id asc, count desc""").df()
pre_segment_id | region | count | |
---|---|---|---|
0 | 10118 | LegNp(T1)(R) | 2455 |
1 | 10118 | LegNp(T3)(R) | 1470 |
2 | 10118 | LegNp(T2)(R) | 1439 |
3 | 10118 | HTct(UTct-T3)(R) | 679 |
4 | 10118 | IntTct | 449 |
5 | 10118 | NTct(UTct-T1)(R) | 448 |
6 | 10118 | ANm | 102 |
7 | 10118 | WTct(UTct-T2)(R) | 70 |
8 | 10118 | CV | 6 |
9 | 10118 | <unspecified> | 1 |
10 | 10126 | LegNp(T1)(L) | 2424 |
11 | 10126 | LegNp(T3)(L) | 1483 |
12 | 10126 | LegNp(T2)(L) | 1282 |
13 | 10126 | HTct(UTct-T3)(L) | 684 |
14 | 10126 | IntTct | 318 |
15 | 10126 | NTct(UTct-T1)(L) | 304 |
16 | 10126 | ANm | 146 |
17 | 10126 | WTct(UTct-T2)(L) | 4 |
18 | 10126 | CV | 3 |
19 | 10126 | <unspecified> | 1 |