Compare regions between datasets on the synaptic level

Goals

How to compute region-based summaries of synaptic data for identified neurons in two datasets and compare between the two

Datasets

We will be using two fruitfly ventral nerve cord datasets for the following analysis, namely FANC (fruitfly_fanc_public) and MANC (fruitfly_manc).

Data Model

In both the MANC and FANC dataset, we are provided with a synapse_region.parquet file that links each predicted synaptic location to a region. The region labels is available in the region columns

Code

First, we load DuckDB, create a connection and set the two baseurls

import json
import duckdb
con = duckdb.connect()

baseurl_fanc = 'https://api.braincircuits.io/data/fruitfly_fanc_public'
baseurl_manc = 'https://api.braincircuits.io/data/fruitfly_manc'

We pick two homologous descending neurons on the left and right side that are matched between the datasets. We produce a string variable where we concatenated the segment IDs for the subsequent queries.

segments_fanc = [648518346492614075, 648518346478550356]
segments_manc = [10118, 10126]

segids_fanc = ','.join(map(str, segments_fanc))
segids_manc = ','.join(map(str, segments_manc))

Let’s get the basic information for those neurons.

df = con.query(f"""SELECT * 
               FROM '{baseurl_fanc}/neurons.parquet' 
               WHERE segment_id in ({segids_fanc})""").df()

print(json.dumps(df.to_dict(orient='records'), indent=2))

[
  {
    "cell_id": 11162,
    "x": 45064,
    "y": 75200,
    "z": 1294,
    "cell_id_type": "neck connective",
    "sv_id": 73608180982162608,
    "segment_id": 648518346492614075,
    "label": "anterior-posterior projection pattern; central neuron; descending; neck connective (right); primary class"
  },
  {
    "cell_id": 10178,
    "x": 34253,
    "y": 75200,
    "z": 1456,
    "cell_id_type": "neck connective",
    "sv_id": 73185968584372736,
    "segment_id": 648518346478550356,
    "label": "Lesser Azevedo et al. 2023; anterior-posterior projection pattern; central neuron; descending; neck connective (left); primary class; publication; uk"
  }
]

df = con.query(f"""SELECT * 
               FROM '{baseurl_manc}/neurons.parquet' 
               WHERE segment_id in ({segids_manc})""").df()

print(json.dumps(df.to_dict(orient='records'), indent=2))

[
  {
    "segment_id": 10118,
    "label": "TBD",
    "celltype": "DNa02",
    "nr_downstream_partner": 20392,
    "nr_post": 886,
    "nr_pre": 2566,
    "nr_upstream_partner": 886,
    "birthtime": null,
    "class": "descending neuron",
    "entryNerve": "CvC",
    "exitNerve": "None",
    "group": 10118.0,
    "hemilineage": "None",
    "longTract": "ITD",
    "modality": null,
    "origin": "brain",
    "positionType": "user",
    "predictedNt": "acetylcholine",
    "rootSide": "RHS",
    "serial": NaN,
    "serialMotif": null,
    "size": 6179155956,
    "somaNeuromere": null,
    "somaSide": null,
    "subclass": "xl",
    "subcluster": NaN,
    "synonyms": null,
    "systematicType": "DNxl002",
    "tag": null,
    "target": "LegNp_R",
    "transmission": null
  },
  {
    "segment_id": 10126,
    "label": "TBD",
    "celltype": "DNa02",
    "nr_downstream_partner": 20026,
    "nr_post": 880,
    "nr_pre": 2509,
    "nr_upstream_partner": 880,
    "birthtime": null,
    "class": "descending neuron",
    "entryNerve": "CvC",
    "exitNerve": "None",
    "group": 10118.0,
    "hemilineage": "None",
    "longTract": "ITD",
    "modality": null,
    "origin": "brain",
    "positionType": "user",
    "predictedNt": "acetylcholine",
    "rootSide": "LHS",
    "serial": NaN,
    "serialMotif": null,
    "size": 6459395030,
    "somaNeuromere": null,
    "somaSide": null,
    "subclass": "xl",
    "subcluster": NaN,
    "synonyms": null,
    "systematicType": "DNxl002",
    "tag": null,
    "target": "LegNp_L",
    "transmission": null
  }
]

Before we go into retrieving more details about the neurons, let’s get an overview of the available regions and their synaptic link count in both datasets.

For FANC

con.execute(f"""select region, count() as count 
    from '{baseurl_fanc}/synapse_link.parquet' 
    group by region order by region;""").df()

	region	count
0		25874
1	AMNp_L	6991
2	AMNp_R	1504
3	ANm_L	3390
4	ANm_R	3072
5	CFF_L	599
6	CFF_R	37
7	DLT_L	3181
8	DLT_R	1012
9	DLV_L	287
10	DLV_R	92
11	DMT_L	4695
12	DMT_R	5210
13	HTct_L	73846
14	HTct_R	28179
15	ITD_HC_L	345
16	ITD_HC_R	45
17	ITD_HT_L	2653
18	ITD_HT_R	196
19	ITD_L	3601
20	ITD_R	340
21	IntTct_L	46649
22	IntTct_R	20328
23	LTct_L	9885
24	LTct_R	8490
25	MDT_L	4276
26	MDT_R	2969
27	MesoNm_L	7065
28	MesoNm_R	1405
29	MetaNm_L	5071
30	MetaNm_R	3316
31	NTct_L	12177
32	NTct_R	3352
33	ProNm_L	15214
34	ProNm_R	5211
35	VLT_L	1451
36	VLT_R	460
37	VTV_L	109
38	VTV_R	112
39	WTct_L	203787
40	WTct_R	94022
41	mVAC_L	15
42	mVAC_R	8

And in MANC

con.execute(f"""select region, count() as count 
    from '{baseurl_manc}/synapse_link.parquet' 
    group by region order by region;""").df()

	region	count
0	<unspecified>	28602
1	ADMN(L)	1460
2	ADMN(R)	1569
3	ANm	3745214
4	AbN1(L)	2
5	AbN1(R)	4
6	AbN2(L)	39
7	AbN2(R)	30
8	AbN3(L)	51
9	AbN3(R)	75
10	AbN4(L)	278
11	AbN4(R)	117
12	AbNT	144
13	CV	8772
14	CvN(L)	28
15	DMetaN(L)	539
16	DMetaN(R)	364
17	DProN(L)	19
18	DProN(R)	442
19	HTct(UTct-T3)(L)	818354
20	HTct(UTct-T3)(R)	832300
21	IntTct	1266501
22	LTct	1214795
23	LegNp(T1)(L)	3169291
24	LegNp(T1)(R)	3347120
25	LegNp(T2)(L)	3148099
26	LegNp(T2)(R)	3635004
27	LegNp(T3)(L)	3252860
28	LegNp(T3)(R)	3780774
29	MesoAN(L)	8
30	MesoAN(R)	31
31	MesoLN(L)	184
32	MesoLN(R)	394
33	MetaLN(L)	236
34	MetaLN(R)	198
35	NTct(UTct-T1)(L)	272896
36	NTct(UTct-T1)(R)	288028
37	Ov(L)	841866
38	Ov(R)	865094
39	PDMN(L)	942
40	PDMN(R)	959
41	PrN(L)	4
42	PrN(R)	218
43	ProAN(L)	7
44	ProAN(R)	71
45	ProCN(L)	2
46	ProCN(R)	158
47	ProLN(L)	334
48	ProLN(R)	828
49	VProN(L)	88
50	VProN(R)	38
51	WTct(UTct-T2)(L)	970855
52	WTct(UTct-T2)(R)	920499
53	mVAC(T1)(L)	99067
54	mVAC(T1)(R)	127890
55	mVAC(T2)(L)	72420
56	mVAC(T2)(R)	95405
57	mVAC(T3)(L)	127490
58	mVAC(T3)(R)	138634

We see that the nomenclature for the brain regions are different in both datasets. In this table you can find the region, tract, nerve and connective correspondences contributed by Kathi Eichler.

We can now query the synapse_link.parquet table which constains a region column for each dataset to retrieve their information. We’re interested here in the regions where the presynaptic sites are located, i.e. the output region of the neuron.

df = con.query(f"""SELECT pre_segment_id, post_segment_id, region 
               FROM '{baseurl_fanc}/synapse_link.parquet' 
               WHERE pre_segment_id in ({segids_fanc})""").df()
df

	pre_segment_id	post_segment_id	region
0	648518346492614075	648518346481564097	ProNm_R
1	648518346478550356	648518346480882144	NTct_L
2	648518346478550356	648518346480882144	NTct_L
3	648518346478550356	648518346478550356	ProNm_L
4	648518346478550356	648518346487504531	ProNm_L
...	...	...	...
1178	648518346478550356	648518346480882144	WTct_L
1179	648518346478550356	648518346471876251	WTct_L
1180	648518346478550356	648518346471876251	WTct_L
1181	648518346492614075	648518346494780554	NTct_R
1182	648518346492614075	648518346494780554

1183 rows × 3 columns

And for MANC

df = con.query(f"""SELECT pre_segment_id, post_segment_id, region
               FROM '{baseurl_manc}/synapse_link.parquet' 
               WHERE pre_segment_id in ({segids_manc})""").df()
df

	pre_segment_id	post_segment_id	region
0	10118	26238	LegNp(T3)(R)
1	10118	21649	LegNp(T3)(R)
2	10118	18842	LegNp(T3)(R)
3	10118	10968	LegNp(T3)(R)
4	10118	26238	LegNp(T3)(R)
...	...	...	...
13763	10118	159083	CV
13764	10126	10829	CV
13765	10126	29930	CV
13766	10118	20352	CV
13767	10126	10829	CV

13768 rows × 3 columns

Next, we like to get the total count of presynaptic location for each segment and region. We can easily do that byy using the group by in the SQL statement:

con.query(f"""SELECT pre_segment_id, region, count(*) as count
               FROM '{baseurl_fanc}/synapse_link.parquet' 
               WHERE pre_segment_id in ({segids_fanc}) \
               GROUP BY region, pre_segment_id 
               ORDER BY pre_segment_id asc, count desc""").df()

	pre_segment_id	region	count
0	648518346478550356	HTct_L	221
1	648518346478550356	ProNm_L	111
2	648518346478550356	NTct_L	63
3	648518346478550356	WTct_L	59
4	648518346478550356	IntTct_L	42
5	648518346478550356	MesoNm_L	39
6	648518346478550356	ITD_L	16
7	648518346478550356	ITD_HT_L	13
8	648518346478550356	ITD_HC_L	7
9	648518346478550356	MetaNm_L	7
10	648518346478550356		5
11	648518346478550356	MDT_L	2
12	648518346478550356	VLT_L	1
13	648518346478550356	ANm_L	1
14	648518346492614075	HTct_R	303
15	648518346492614075	NTct_R	87
16	648518346492614075	IntTct_R	48
17	648518346492614075	ProNm_R	48
18	648518346492614075	WTct_R	26
19	648518346492614075	ITD_R	22
20	648518346492614075	MDT_R	16
21	648518346492614075	MesoNm_R	13
22	648518346492614075	MetaNm_R	12
23	648518346492614075	ITD_HC_R	9
24	648518346492614075	DMT_R	5
25	648518346492614075		4
26	648518346492614075	ANm_R	1
27	648518346492614075	ITD_HT_R	1
28	648518346492614075	VLT_R	1

And for MANC

con.query(f"""SELECT pre_segment_id, region, count(*) as count
               FROM '{baseurl_manc}/synapse_link.parquet' 
               WHERE pre_segment_id in ({segids_manc}) 
               GROUP BY region, pre_segment_id 
               ORDER BY pre_segment_id asc, count desc""").df()

	pre_segment_id	region	count
0	10118	LegNp(T1)(R)	2455
1	10118	LegNp(T3)(R)	1470
2	10118	LegNp(T2)(R)	1439
3	10118	HTct(UTct-T3)(R)	679
4	10118	IntTct	449
5	10118	NTct(UTct-T1)(R)	448
6	10118	ANm	102
7	10118	WTct(UTct-T2)(R)	70
8	10118	CV	6
9	10118	<unspecified>	1
10	10126	LegNp(T1)(L)	2424
11	10126	LegNp(T3)(L)	1483
12	10126	LegNp(T2)(L)	1282
13	10126	HTct(UTct-T3)(L)	684
14	10126	IntTct	318
15	10126	NTct(UTct-T1)(L)	304
16	10126	ANm	146
17	10126	WTct(UTct-T2)(L)	4
18	10126	CV	3
19	10126	<unspecified>	1