Skip to content

Commit 6c17628

Browse files
committed
update
1 parent 7edf396 commit 6c17628

129 files changed

Lines changed: 2272 additions & 496 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/publish.yaml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
name: Publish to PyPI
2+
3+
on:
4+
push:
5+
tags:
6+
- "v*.*.*"
7+
8+
jobs:
9+
build-and-publish:
10+
runs-on: ubuntu-latest
11+
12+
steps:
13+
- name: Checkout source
14+
uses: actions/checkout@v4
15+
with:
16+
fetch-depth: 0
17+
18+
- name: Set up Python
19+
uses: actions/setup-python@v5
20+
with:
21+
python-version: "3.10"
22+
23+
- name: Install build tooling
24+
run: pip install build
25+
26+
- name: Build wheel and sdist with setuptools
27+
run: python -m build
28+
29+
- name: Publish to PyPI
30+
uses: pypa/gh-action-pypi-publish@release/v1
31+
with:
32+
password: ${{ secrets.PYPI_PUBLISH_TOKEN }}

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,13 @@ venv/
22
__pycache__/
33
.*
44
!.gitignore
5+
!.github/
56
graveyard/
67
profile/
78
lightning_logs/
89
todo.md
910
debug.py
1011
print_profile.py
1112
install.txt
12-
profile.txt
13+
profile.txt
14+
bioverse.egg-info/

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The Bioverse

bioverse/adapters/beta_lactam.py

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
import random
2+
import struct
3+
import zlib
4+
5+
import awkward as ak
6+
import numpy as np
7+
import pandas as pd
8+
from rdkit import Chem
9+
10+
from ..adapter import Adapter
11+
from ..data import Assets, Split
12+
from ..utilities import IteratorWithLength, batched, config, download
13+
14+
15+
class BetaLactamAdapter(Adapter):
    """Adapter producing a binary β-lactam classification dataset.

    Molecules come from ``fetch_antibiotic_molecules`` and are labelled by
    ``split_by_beta_lactam``: label 1 for records whose structure matches the
    β-lactam motif, label 0 otherwise.
    """

    @classmethod
    def download(cls):
        """Fetch the molecules and build batches, split assignment and assets.

        Returns:
            A tuple ``(batches, split, assets)`` where ``batches`` wraps a
            generator of one ``ak.Record`` per molecule, ``split`` assigns
            every record to train (0), val (1) or test (2), and ``assets`` is
            an empty ``Assets`` container.

        Raises:
            AssertionError: If no molecules could be retrieved.
            RuntimeError: If either class is too small to supply the
                validation and test quotas.
        """
        molecules = fetch_antibiotic_molecules()

        # Explicit raise instead of ``assert`` so the guard survives ``python -O``;
        # the exception type and message are unchanged for existing callers.
        if not molecules:
            raise AssertionError(
                "No molecules retrieved. Check network connectivity or ChEMBL availability."
            )

        pos, neg = split_by_beta_lactam(molecules)
        rng = np.random.default_rng(0)
        rng.shuffle(pos)
        rng.shuffle(neg)

        # Each class contributes total // 20 records to val and to test,
        # so val and test each hold roughly 10% of the data overall.
        total = len(pos) + len(neg)
        n_val, n_test = total // 20, total // 20
        n_train_pos = len(pos) - n_val - n_test
        n_train_neg = len(neg) - n_val - n_test

        # A negative train count would silently produce a split vector longer
        # than the data frame and fail later with an opaque indexing error.
        if n_train_pos < 0 or n_train_neg < 0:
            raise RuntimeError(
                "Not enough molecules per class to build the split: "
                f"pos={len(pos)}, neg={len(neg)}, "
                f"required per class={n_val + n_test}"
            )

        df = make_df(pos, neg)

        # Row order of ``df`` is all positives followed by all negatives, so
        # the split codes are laid out in the same order.
        split = np.array(
            [[1]] * n_val
            + [[2]] * n_test
            + [[0]] * n_train_pos
            + [[1]] * n_val
            + [[2]] * n_test
            + [[0]] * n_train_neg
        )
        # Use the seeded generator (not the global NumPy RNG) so the final
        # ordering is reproducible across runs.
        perm = rng.permutation(len(split))
        split, df = split[perm], df.iloc[perm]

        # create records
        def generator():
            for smiles, chembl_id, name, beta_lactam in zip(
                df["smiles"], df["chembl_id"], df["name"], df["beta_lactam"]
            ):
                data = {
                    "molecule_smiles": [[smiles]],
                    "molecule_id": [[chembl_id]],
                    "molecule_name": [[name]],
                    "molecule_label": [[beta_lactam]],
                }
                yield ak.Record(data)

        batches = batched(IteratorWithLength(generator(), len(df)))
        return batches, Split(split, names=["train", "val", "test"]), Assets({})
64+
65+
66+
def iter_unique_molecules(records):
    """Yield each record the first time its ``molecule_chembl_id`` appears.

    Records whose ``molecule_chembl_id`` is missing or falsy are skipped, as
    are later records repeating an id that was already yielded.
    """
    emitted = set()
    for record in records:
        key = record.get("molecule_chembl_id")
        if key and key not in emitted:
            emitted.add(key)
            yield record
76+
77+
78+
def fetch_antibiotic_molecules():
    """
    Fetch molecules classified as antibiotics from ChEMBL using multiple strategies:
    1. ATC J01 (Antibacterials for systemic use)
    2. Molecules with synonyms containing antibiotic-related terms
    3. Molecules with mechanism of action related to antibiotics
    4. Molecules indicated for bacterial infections

    Every strategy is best-effort: a failing query is logged and skipped so
    the remaining strategies still run.

    Returns:
        A list of molecule records, de-duplicated by ``molecule_chembl_id``
        in first-seen order, keeping only records that carry a non-empty
        ``canonical_smiles`` under ``molecule_structures``.
    """
    import logging

    from chembl_webresource_client.new_client import new_client

    logger = logging.getLogger(__name__)

    molecule = new_client.molecule
    atc = new_client.atc_class
    fields = [
        "molecule_chembl_id",
        "pref_name",
        "atc_classifications",
        "molecule_structures",
    ]
    results = []
    seen_mol_ids = set()

    def add_molecules(mol_list):
        """Helper to add molecules while tracking seen IDs."""
        for rec in mol_list:
            mol_id = rec.get("molecule_chembl_id")
            if mol_id and mol_id not in seen_mol_ids:
                seen_mol_ids.add(mol_id)
                results.append(rec)

    def collect_ids(resource_name, lookup, terms):
        """Return the ChEMBL ids of records whose ``lookup`` field matches any term."""
        mol_ids = set()
        try:
            resource = getattr(new_client, resource_name)
        except Exception:
            logger.warning(
                "ChEMBL resource %r unavailable; skipping", resource_name, exc_info=True
            )
            return mol_ids
        for term in terms:
            try:
                for rec in resource.filter(**{lookup: term}):
                    mol_id = rec.get("molecule_chembl_id")
                    if mol_id:
                        mol_ids.add(mol_id)
            except Exception:
                # Ids gathered before the failure are kept; move on to the next term.
                logger.debug(
                    "ChEMBL %s query for %r failed", resource_name, term, exc_info=True
                )
        return mol_ids

    def fetch_by_ids(mol_ids):
        """Fetch and record the full molecule entry for every id not seen yet."""
        for mol_id in mol_ids:
            if mol_id in seen_mol_ids:
                continue
            try:
                mol_rec = molecule.filter(
                    molecule_chembl_id=mol_id,
                    molecule_structures__isnull=False,
                ).only(fields)
                # Materialize first so a failure mid-fetch adds nothing partial.
                add_molecules(list(mol_rec))
            except Exception:
                logger.debug("ChEMBL molecule fetch for %s failed", mol_id, exc_info=True)

    # Strategy 1: ATC J01 (Antibacterials for systemic use)
    try:
        level5_codes = [
            rec["level5"] for rec in atc.filter(level2="J01") if rec.get("level5")
        ]
    except Exception:
        logger.warning("ATC J01 lookup failed; skipping ATC strategy", exc_info=True)
        level5_codes = []
    for code in level5_codes:
        try:
            res = molecule.filter(
                atc_classifications=code,
                molecule_structures__isnull=False,
            ).only(fields)
            add_molecules(list(res))
        except Exception:
            logger.debug("ChEMBL molecule query for ATC %s failed", code, exc_info=True)

    # Strategy 2: Molecules with synonyms containing antibiotic-related terms
    synonym_terms = [
        "antibiotic",
        "antibacterial",
        "antimicrobial",
        "bactericidal",
        "bacteriostatic",
    ]
    fetch_by_ids(collect_ids("molecule_synonym", "synonym__icontains", synonym_terms))

    # Strategy 3: Molecules with mechanism of action related to antibiotics
    moa_terms = ["antibiotic", "antibacterial", "bacterial", "bactericidal"]
    fetch_by_ids(collect_ids("mechanism", "mechanism_of_action__icontains", moa_terms))

    # Strategy 4: Molecules indicated for bacterial infections
    indication_terms = ["bacterial infection", "bacteremia", "sepsis", "pneumonia"]
    fetch_by_ids(collect_ids("drug_indication", "efo_term__icontains", indication_terms))

    # Keep only those with a canonical SMILES
    return [
        rec
        for rec in results
        if (rec.get("molecule_structures") or {}).get("canonical_smiles")
    ]
230+
231+
232+
def get_smiles(rec):
    """Return the record's ``canonical_smiles``, or ``None`` when absent.

    ``None`` is also returned when ``molecule_structures`` is missing or falsy.
    """
    structures = rec.get("molecule_structures")
    if not structures:
        return None
    return structures.get("canonical_smiles")
235+
236+
237+
def has_beta_lactam(smiles, motif):
    """Report whether the molecule parsed from ``smiles`` contains ``motif``.

    A SMILES string that RDKit cannot parse (``MolFromSmiles`` returns
    ``None``) counts as not matching.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return False if parsed is None else parsed.HasSubstructMatch(motif)
242+
243+
244+
def split_by_beta_lactam(molecules):
    """Partition molecule records into β-lactam positives and negatives.

    Records without a SMILES string are dropped. Returns the tuple
    ``(positives, negatives)``, each preserving input order.

    Raises:
        RuntimeError: If the β-lactam SMARTS pattern fails to compile.
    """
    # Permissive 4-membered cyclic amide core, so substituted rings match too.
    motif = Chem.MolFromSmarts("N1C(=O)CC1")
    if motif is None:
        raise RuntimeError("Failed to compile β-lactam SMARTS pattern.")

    buckets = {True: [], False: []}
    for rec in molecules:
        smi = get_smiles(rec)
        if smi:
            buckets[bool(has_beta_lactam(smi, motif))].append(rec)
    return buckets[True], buckets[False]
262+
263+
264+
def sample_balanced(positives, negatives, n_pos, n_neg, seed):
    """Draw a seeded random sample of ``n_pos`` positives and ``n_neg`` negatives.

    The inputs are left untouched: shuffling happens on copies, using
    ``seed`` for the positives and ``seed + 1`` for the negatives.

    Returns:
        A tuple ``(sampled_positives, sampled_negatives)`` of new lists.

    Raises:
        RuntimeError: If fewer molecules are available than requested.
    """
    n_pos_avail = len(positives)
    n_neg_avail = len(negatives)
    if n_pos_avail < n_pos or n_neg_avail < n_neg:
        raise RuntimeError(
            f"Not enough molecules to satisfy requested counts. Requested pos={n_pos}, neg={n_neg}, available pos={n_pos_avail}, neg={n_neg_avail}"
        )

    # Shuffle copies so the caller's lists are not reordered as a side effect;
    # the resulting order is the same one an in-place shuffle would produce.
    shuffled_pos = list(positives)
    shuffled_neg = list(negatives)
    random.Random(seed).shuffle(shuffled_pos)
    random.Random(seed + 1).shuffle(shuffled_neg)
    return shuffled_pos[:n_pos], shuffled_neg[:n_neg]
274+
275+
276+
def make_df(positives, negatives):
    """Build the labelled molecule table: positives first, then negatives.

    Columns are ``chembl_id``, ``name``, ``smiles`` and ``beta_lactam``
    (1 for records from ``positives``, 0 for records from ``negatives``).
    """

    def as_row(rec, label):
        return {
            "chembl_id": rec.get("molecule_chembl_id"),
            "name": rec.get("pref_name"),
            "smiles": get_smiles(rec),
            "beta_lactam": label,
        }

    labelled_rows = [as_row(rec, 1) for rec in positives]
    labelled_rows.extend(as_row(rec, 0) for rec in negatives)
    return pd.DataFrame(
        labelled_rows, columns=["chembl_id", "name", "smiles", "beta_lactam"]
    )

0 commit comments

Comments
 (0)