Skip to content

Commit c8ce54a

Browse files
authored
Merge pull request #427 from nf-core/add_ampcombi_2.0.1
Add ampcombi 2.0.1
2 parents f4e13b3 + 8e1da95 commit c8ce54a

28 files changed

+418
-361
lines changed

CHANGELOG.md

+8
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
### `Added`
99

10+
- [#427](https://github.com/nf-core/funcscan/pull/427) Updated AMPcombi from v0.2.2 to v2.0.1. AMPcombi can now use multiple additional reference databases for classification. (by @darcy220606)
11+
1012
### `Fixed`
1113

14+
- [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, which were caused by non-ASCII characters. (by @darcy220606)
15+
1216
### `Dependencies`
1317

18+
| Tool | Previous version | New version |
19+
| -------- | ---------------- | ----------- |
20+
| AMPcombi | 0.2.2 | 2.0.1 |
21+
1422
### `Deprecated`
1523

1624
## v2.0.0 - [2024-09-05]

bin/ampcombi_download.py

+125-59
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,144 @@
11
#!/usr/bin/env python3
22

33
#########################################
4-
# Authors: [Anan Ibrahim](https://github.com/brianjohnhaas), [Louisa Perelo](https://github.com/louperelo)
4+
# Authors: [Anan Ibrahim](https://github.com/Darcy220606/AMPcombi), [Louisa Perelo](https://github.com/louperelo)
55
# File: amp_database.py
66
# Source: https://github.com/Darcy220606/AMPcombi/blob/main/ampcombi/amp_database.py
7-
# Source+commit: https://github.com/Darcy220606/AMPcombi/commit/a75bc00c32ecf873a133b18cf01f172ad9cf0d2d/ampcombi/amp_database.py
8-
# Download Date: 2023-03-08, commit: a75bc00c
97
# This source code is licensed under the MIT license
108
#########################################
119

12-
# TITLE: Download the DRAMP database if the input db is empty and make the database compatible with diamond
10+
# TITLE: Download the reference database specified by the user.
1311

1412
import pandas as pd
1513
import requests
1614
import os
17-
from datetime import datetime
15+
import re
1816
import subprocess
19-
from Bio import SeqIO
20-
import tempfile
21-
import shutil
17+
import argparse
2218

19+
from datetime import datetime
20+
from Bio.Seq import Seq
21+
from Bio.SeqRecord import SeqRecord
22+
from Bio import SeqIO
2323

2424
########################################
25-
# FUNCTION: DOWNLOAD DRAMP DATABASE AND CLEAN IT
25+
# FUNCTION: DOWNLOAD DATABASES AND CLEAN DRAMP and APD
2626
#########################################
27-
def download_DRAMP(db):
    """
    Download the DRAMP general AMP table and FASTA file into an existing directory.

    The Excel table is converted to a tab-separated file and the FASTA file is
    rewritten so that every sequence contains only the 20 canonical amino-acid
    letters. Both output files carry the download date in their name. The raw
    Excel and FASTA downloads are deleted before returning.

    Parameters:
    ----------
    db : str
        Path of the directory the files are written to. The directory is not
        created here, so it must already exist.

    Returns:
    -------
    tuple
        The results of the two `os.remove` calls, i.e. `(None, None)`.
    """
    ## Download the (table) file and store it in the target directory
    url = "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.xlsx"
    r = requests.get(url, allow_redirects=True)
    with open(db + "/" + "general_amps.xlsx", "wb") as f:
        f.write(r.content)

    ## Convert excel to a tab-separated file named after the download date
    date = datetime.now().strftime("%Y_%m_%d")
    ref_amps = pd.read_excel(db + "/" + r"general_amps.xlsx")
    ref_amps.to_csv(db + "/" + f"general_amps_{date}.tsv", index=None, header=True, sep="\t")

    ## Download the (fasta) file and store it in the target directory
    urlfasta = (
        "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.fasta"
    )
    z = requests.get(urlfasta)
    fasta_path = os.path.join(db + "/" + f"general_amps_{date}.fasta")
    with open(fasta_path, "wb") as f:
        f.write(z.content)

    ## Cleaning step: drop every character that is not a canonical amino acid
    ## (e.g. zeros and brackets). The set is built once for O(1) membership tests.
    letters = frozenset("ACDEFGHIKLMNPQRSTVWY")
    new_fasta = db + "/" + f"general_amps_{date}_clean.fasta"
    # Both handles are managed by the `with` statement so the raw FASTA is
    # closed before it is removed below.
    with open(fasta_path) as raw_handle, open(new_fasta, "w") as clean_handle:
        for record in SeqIO.parse(raw_handle, "fasta"):
            cleaned = "".join(residue for residue in str(record.seq) if residue in letters)
            clean_handle.write(">" + record.id + "\n" + cleaned + "\n")

    return os.remove(fasta_path), os.remove(db + "/" + r"general_amps.xlsx")
27+
# A sequence is kept only if it consists entirely of the 20 canonical
# amino-acid letters; anything else (digits, brackets, ambiguity codes,
# lower case) disqualifies the whole sequence.
_VALID_AA_SEQUENCE = re.compile(r"[ACDEFGHIKLMNPQRSTVWY]+")

# Seconds `requests` waits for the server before giving up, so that a stalled
# connection surfaces as a reported download failure instead of hanging.
_REQUEST_TIMEOUT = 60


def _is_valid_sequence(sequence):
    """Return True if *sequence* is a non-empty string of canonical amino acids."""
    # Empty table cells are read by pandas as float NaN, hence the type check.
    return isinstance(sequence, str) and _VALID_AA_SEQUENCE.fullmatch(sequence) is not None


def _download_dramp():
    """Download the DRAMP general AMP table and derive a cleaned FASTA file from it."""
    db = 'amp_DRAMP_database'
    os.makedirs(db, exist_ok=True)
    url = 'http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.txt'
    try:
        response = requests.get(url, allow_redirects=True, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()  # Check for any download errors
    except requests.exceptions.RequestException as e:
        print(f"Failed to download DRAMP AMP general database file: {e}")
        return

    date = datetime.now().strftime("%Y_%m_%d")
    table_path = f'{db}/general_amps_{date}.txt'
    with open(table_path, 'wb') as file:
        file.write(response.content)
    print(f"File downloaded successfully and saved to {db}/general_amps_{date}.txt")

    # Create the fasta version, keeping only rows with a valid sequence
    db_df = pd.read_csv(table_path, sep='\t')
    records = [
        SeqRecord(Seq(sequence), id=str(dramp_id), description="")
        for dramp_id, sequence in zip(db_df['DRAMP_ID'], db_df['Sequence'])
        if _is_valid_sequence(sequence)
    ]
    SeqIO.write(records, f'{db}/general_amps_{date}.fasta', "fasta")


def _download_apd():
    """Download the APD FASTA release and write a cleaned table and FASTA file."""
    import io  # local import: only this helper needs an in-memory text handle

    db = 'amp_APD_database'
    os.makedirs(db, exist_ok=True)
    url = 'https://aps.unmc.edu/assets/sequences/APD_sequence_release_09142020.fasta'
    try:
        # NOTE(security): SSL verification is disabled due to a site certificate
        # issue, so the download is not protected against tampering in transit.
        # Re-enable verification once the certificate is fixed upstream.
        response = requests.get(url, allow_redirects=True, verify=False, timeout=_REQUEST_TIMEOUT)
        response.raise_for_status()
        content = response.text
        print("APD AMP database downloaded successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download content: {e}")
        return

    try:
        # Parse the download in memory so no temporary file is left behind
        # if a later step fails.
        kept = [
            record
            for record in SeqIO.parse(io.StringIO(content), "fasta")
            if _is_valid_sequence(str(record.seq))
        ]
        # Fasta to table: the ID is the part of the header before the first '|'
        db_df = pd.DataFrame({
            "APD_ID": [record.description.split('|')[0] for record in kept],
            "APD_Description": [record.description for record in kept],
            "APD_Sequence": [str(record.seq) for record in kept],
        })
        db_df.to_csv(f'{db}/APD.txt', sep='\t', index=False, header=True)
        # Table to fasta: headers are reduced to the bare APD ID
        records = [
            SeqRecord(Seq(sequence), id=str(apd_id), description="")
            for apd_id, sequence in zip(db_df['APD_ID'], db_df['APD_Sequence'])
        ]
        SeqIO.write(records, f'{db}/APD.fasta', "fasta")
        print(f"APD AMP database saved successfully to {db}/APD.fasta")
    except Exception as e:
        # Deliberate best-effort: any failure while saving is reported, not raised.
        print(f"Failed to save APD AMP database: {e}")


def _download_uniref100(threads):
    """Download UniRef100 with `mmseqs databases` using *threads* threads."""
    db = 'amp_UniRef100_database'
    os.makedirs(db, exist_ok=True)
    os.makedirs(f'{db}/mmseqs2', exist_ok=True)
    # Argument list instead of a shell string: nothing is interpreted by a shell.
    command = [
        "mmseqs", "databases", "UniRef100",
        f"{db}/mmseqs2/ref_DB", f"{db}/mmseqs2/tmp",
        "--remove-tmp-files", "true",
        "--threads", str(threads),
        "-v", "0",
    ]
    try:
        subprocess.run(command, check=True)
        print(f"UniRef100 protein database downloaded successfully and saved to {db}/mmseqs2/ref_DB")
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing `mmseqs` executable, which a shell would
        # have reported as a non-zero exit status instead.
        print(f"Failed to download UniRef100 protein database: {e}")


def download_ref_db(database, threads):
    """
    Downloads a specified AMP (antimicrobial peptide) reference database based on the
    provided database name and saves it to a database-specific directory.
    This supports downloading databases only from DRAMP, APD, and UniRef100.

    The output directory is created in the current working directory and is named
    'amp_DRAMP_database', 'amp_APD_database' or 'amp_UniRef100_database'.
    Download failures are printed and the function returns normally; they are
    not raised. Any other `database` value does nothing.

    Parameters:
    ----------
    database : str
        The name of the database to download. Must be one of 'DRAMP', 'APD', or 'UniRef100'.
    threads : int
        Number of threads to use when downloading the UniRef100 database with `mmseqs`.
        Ignored for the other databases.
    """
    if database == 'DRAMP':
        _download_dramp()
    elif database == 'APD':
        _download_apd()
    elif database == 'UniRef100':
        _download_uniref100(threads)
76134

135+
if __name__ == "__main__":
    # Command-line entry point: read the database ID and thread count,
    # then hand both to the downloader.
    cli = argparse.ArgumentParser(
        description="Downloads a specified AMP (antimicrobial peptide) reference database based on the provided database name and saves it to the specified directory.",
    )
    cli.add_argument(
        "--database_id",
        dest="database",
        type=str,
        required=True,
        choices=["DRAMP", "APD", "UniRef100"],
        help="Database ID - one of DRAMP, APD, or UniRef100. This parameter is required.",
    )
    cli.add_argument(
        "--threads",
        type=int,
        default=4,
        help="Number of threads supplied to mmseqs databases. Only relevant in the case of 'UniRef100'. Default is 4.",
    )

    options = cli.parse_args()
    download_ref_db(options.database, options.threads)

conf/base.config

+2
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ process {
222222
withName: AMPCOMBI2_PARSETABLES {
223223
memory = { 8.GB * task.attempt }
224224
time = { 2.h * task.attempt }
225+
errorStrategy = { task.exitStatus == 1 ? 'retry' : 'finish' }
226+
maxRetries = 2 // Retry the process up to 2 times
225227
}
226228

227229
withName: AMPCOMBI2_CLUSTER {

conf/modules.config

+2-2
Original file line numberDiff line numberDiff line change
@@ -674,9 +674,9 @@ process {
674674
]
675675
}
676676

677-
withName: DRAMP_DOWNLOAD {
677+
withName: AMP_DATABASE_DOWNLOAD {
678678
publishDir = [
679-
path: { "${params.outdir}/databases/dramp" },
679+
path: { "${params.outdir}/databases/${params.amp_ampcombi_db}" },
680680
mode: params.publish_dir_mode,
681681
enabled: params.save_db,
682682
saveAs: { filename -> filename.equals('versions.yml') ? null : filename }

0 commit comments

Comments
 (0)