#!/usr/bin/env python3

#########################################
-# Authors: [Anan Ibrahim](https://github.com/brianjohnhaas), [Louisa Perelo](https://github.com/louperelo)
+# Authors: [Anan Ibrahim](https://github.com/Darcy220606/AMPcombi), [Louisa Perelo](https://github.com/louperelo)
# File: amp_database.py
# Source: https://github.com/Darcy220606/AMPcombi/blob/main/ampcombi/amp_database.py
-# Source+commit: https://github.com/Darcy220606/AMPcombi/commit/a75bc00c32ecf873a133b18cf01f172ad9cf0d2d/ampcombi/amp_database.py
-# Download Date: 2023-03-08, commit: a75bc00c
# This source code is licensed under the MIT license
#########################################

-# TITLE: Download the DRAMP database if input db empty AND and make database compatible for diamond
+# TITLE: Download the reference database specified by the user.

import pandas as pd
import requests
import os
-from datetime import datetime
+import re
import subprocess
-from Bio import SeqIO
-import tempfile
-import shutil
+import argparse

+from datetime import datetime
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+from Bio import SeqIO

########################################
-# FUNCTION: DOWNLOAD DRAMP DATABASE AND CLEAN IT
+# FUNCTION: DOWNLOAD DATABASES AND CLEAN DRAMP and APD
#########################################
-def download_DRAMP(db):
-    ##Download the (table) file and store it in a results directory
-    url = "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.xlsx"
-    r = requests.get(url, allow_redirects=True)
-    with open(db + "/" + "general_amps.xlsx", "wb") as f:
-        f.write(r.content)
-    ##Convert excel to tab sep file and write it to a file in the DRAMP_db directly with the date its downloaded
-    date = datetime.now().strftime("%Y_%m_%d")
-    ref_amps = pd.read_excel(db + "/" + r"general_amps.xlsx")
-    ref_amps.to_csv(db + "/" + f"general_amps_{date}.tsv", index=None, header=True, sep="\t")
-    ##Download the (fasta) file and store it in a results directory
-    urlfasta = (
-        "http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.fasta"
-    )
-    z = requests.get(urlfasta)
-    fasta_path = os.path.join(db + "/" + f"general_amps_{date}.fasta")
-    with open(fasta_path, "wb") as f:
-        f.write(z.content)
-    ##Cleaning step to remove ambigous aminoacids from sequences in the database (e.g. zeros and brackets)
-    new_fasta = db + "/" + f"general_amps_{date}_clean.fasta"
-    seq_record = SeqIO.parse(open(fasta_path), "fasta")
-    with open(new_fasta, "w") as f:
-        for record in seq_record:
-            id, sequence = record.id, str(record.seq)
-            letters = [
-                "A",
-                "C",
-                "D",
-                "E",
-                "F",
-                "G",
-                "H",
-                "I",
-                "K",
-                "L",
-                "M",
-                "N",
-                "P",
-                "Q",
-                "R",
-                "S",
-                "T",
-                "V",
-                "W",
-                "Y",
-            ]
-            new = "".join(i for i in sequence if i in letters)
-            f.write(">" + id + "\n" + new + "\n")
-    return os.remove(fasta_path), os.remove(db + "/" + r"general_amps.xlsx")
+def download_ref_db(database, threads):
+    """
+    Downloads the specified AMP (antimicrobial peptide) reference database and saves it
+    to a dedicated output directory named after the database (e.g. amp_DRAMP_database).
+    Only the DRAMP, APD, and UniRef100 databases are supported.
+
+    Parameters
+    ----------
+    database : str
+        The name of the database to download. Must be one of 'DRAMP', 'APD', or 'UniRef100'.
+    threads : int
+        Number of threads to use when downloading the UniRef100 database with `mmseqs`.
+    """
+    # Check which database was given
+    if database == 'DRAMP':
+        # Create dir
+        db = 'amp_DRAMP_database'
+        os.makedirs(db, exist_ok=True)
+        # Download the file
+        try:
+            url = 'http://dramp.cpu-bioinfor.org/downloads/download.php?filename=download_data/DRAMP3.0_new/general_amps.txt'
+            response = requests.get(url, allow_redirects=True)
+            response.raise_for_status()  # Check for any download errors
+            date = datetime.now().strftime("%Y_%m_%d")
+            with open(db + '/' + f'general_amps_{date}.txt', 'wb') as file:
+                file.write(response.content)
+            print(f"File downloaded successfully and saved to {db}/general_amps_{date}.txt")
+            # Create fasta version and clean it
+            db_df = pd.read_csv(f'{db}/general_amps_{date}.txt', sep='\t')
+            records = []
+            valid_sequence_pattern = re.compile("^[ACDEFGHIKLMNPQRSTVWY]+$")
+            for index, row in db_df.iterrows():
+                sequence = row['Sequence']
+                if valid_sequence_pattern.match(sequence):
+                    record = SeqRecord(Seq(sequence), id=str(row['DRAMP_ID']), description="")
+                    records.append(record)
+            output_file = f'{db}/general_amps_{date}.fasta'
+            SeqIO.write(records, output_file, "fasta")
+        except requests.exceptions.RequestException as e:
+            print(f"Failed to download DRAMP AMP general database file: {e}")
+            return
+
+    if database == 'APD':
+        # Create dir
+        db = 'amp_APD_database'
+        os.makedirs(db, exist_ok=True)
+        # Download the file
+        try:
+            url = 'https://aps.unmc.edu/assets/sequences/APD_sequence_release_09142020.fasta'
+            response = requests.get(url, allow_redirects=True, verify=False)  # Disable SSL verification due to site certificate issue
+            response.raise_for_status()
+            content = response.text
+            print("APD AMP database downloaded successfully.")
+        except requests.exceptions.RequestException as e:
+            print(f"Failed to download content: {e}")
+            return
+        # Save the content line-by-line exactly as is
+        try:
+            with open(db + '/' + 'APD_orig.fasta', 'w') as file:
+                file.write(content)
+            with open(f'{db}/APD.fasta', 'w') as output_handle:
+                valid_sequence_pattern = re.compile("^[ACDEFGHIKLMNPQRSTVWY]+$")
+                for record in SeqIO.parse(f'{db}/APD_orig.fasta', "fasta"):
+                    sequence = str(record.seq)
+                    if valid_sequence_pattern.match(sequence):
+                        SeqIO.write(record, output_handle, "fasta")
+            os.remove(db + '/' + 'APD_orig.fasta')
+            print(f"APD AMP database saved successfully to {db}/APD.fasta")
+            # Fasta to table
+            headers = []
+            sequences = []
+            seq_ids = []
+            for i, record in enumerate(SeqIO.parse(f'{db}/APD.fasta', "fasta")):
+                sequence_id = record.description.split('|')[0]
+                headers.append(record.description)
+                sequences.append(str(record.seq))
+                seq_ids.append(sequence_id)
+            db_df = pd.DataFrame({
+                "APD_ID": seq_ids,
+                "APD_Description": headers,
+                "APD_Sequence": sequences})
+            db_df.to_csv(f'{db}/APD.txt', sep='\t', index=False, header=True)
+            os.remove(db + '/' + 'APD.fasta')
+            # Table to fasta
+            records = []
+            for index, row in db_df.iterrows():
+                sequence = row['APD_Sequence']
+                record = SeqRecord(Seq(sequence), id=str(row['APD_ID']), description="")
+                records.append(record)
+            output_file = f'{db}/APD.fasta'
+            SeqIO.write(records, output_file, "fasta")
+        except Exception as e:
+            print(f"Failed to save APD AMP database: {e}")
+
+    if database == 'UniRef100':
+        # Create dir
+        db = 'amp_UniRef100_database'
+        os.makedirs(db, exist_ok=True)
+        # Download the file
+        try:
+            os.makedirs(f'{db}/mmseqs2', exist_ok=True)
+            command = f"mmseqs databases UniRef100 {db}/mmseqs2/ref_DB {db}/mmseqs2/tmp --remove-tmp-files true --threads {threads} -v 0"
+            subprocess.run(command, shell=True, check=True)
+            print(f"UniRef100 protein database downloaded successfully and saved to {db}/mmseqs2/UniRef100")
+        except subprocess.CalledProcessError as e:
+            print(f"Failed to download UniRef100 protein database: {e}")
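Both the DRAMP and APD branches above keep only sequences built from the 20 standard amino-acid letters before writing the cleaned FASTA. A minimal, self-contained sketch of that filter (the toy sequences below are illustrative, not taken from either database):

    import re

    # Same validity pattern as in the DRAMP/APD branches: one or more standard amino-acid letters.
    valid_sequence_pattern = re.compile("^[ACDEFGHIKLMNPQRSTVWY]+$")

    # Illustrative records; the second contains brackets and a zero, the kind of
    # ambiguity the cleaning step is meant to drop, so it fails the match.
    sequences = {"toy_ok": "GIGKFLHSAKKFGKAFVGEIMNS", "toy_bad": "GIGK(FLHS)0AKKF"}
    kept = {seq_id: seq for seq_id, seq in sequences.items() if valid_sequence_pattern.match(seq)}
    print(kept)  # {'toy_ok': 'GIGKFLHSAKKFGKAFVGEIMNS'}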

+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Downloads a specified AMP (antimicrobial peptide) reference database based on the provided database name and saves it to the specified directory.")
+    parser.add_argument("--database_id", dest="database", type=str, required=True, choices=["DRAMP", "APD", "UniRef100"],
+                        help="Database ID - one of DRAMP, APD, or UniRef100. This parameter is required.")
+    parser.add_argument("--threads", type=int, default=4,
+                        help="Number of threads supplied to mmseqs databases. Only relevant in the case of 'UniRef100'. Default is 4.")

-download_DRAMP("amp_ref_database")
+    args = parser.parse_args()
+    download_ref_db(args.database, args.threads)
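For reference, a minimal usage sketch of the new entry point; the importable module name is an assumption based on the file name, and the database choice and thread count are only examples:

    # Assumed import path: the file above is amp_database.py, so the module is
    # taken to be importable as amp_database (an assumption, not stated in the diff).
    from amp_database import download_ref_db

    # 'DRAMP' is one of the three accepted IDs (DRAMP, APD, UniRef100); the thread
    # count is only used by the UniRef100/mmseqs branch. For DRAMP this writes
    # amp_DRAMP_database/general_amps_<date>.txt and a cleaned .fasta alongside it.
    download_ref_db("DRAMP", 4)

    # Equivalent command-line call handled by the argparse block above:
    #   python amp_database.py --database_id DRAMP --threads 4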