This is an automated email from the ASF dual-hosted git repository. lahirujayathilake pushed a commit to branch cybershuttle-dev in repository https://gitbox.apache.org/repos/asf/airavata.git
commit 56f28c3639e1921d246625374dc655375e325f68 Author: yasith <[email protected]> AuthorDate: Sun Mar 16 03:47:37 2025 +0000 add AF2Complex notebook --- .../data/af2complex/AF2Complex_notebook.ipynb | 1021 ++++++++++++++++++++ 1 file changed, 1021 insertions(+) diff --git a/modules/agent-framework/deployments/jupyterhub/data/af2complex/AF2Complex_notebook.ipynb b/modules/agent-framework/deployments/jupyterhub/data/af2complex/AF2Complex_notebook.ipynb new file mode 100644 index 0000000000..5841c5f093 --- /dev/null +++ b/modules/agent-framework/deployments/jupyterhub/data/af2complex/AF2Complex_notebook.ipynb @@ -0,0 +1,1021 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "sl1IbeCnW4zg" + }, + "source": [ + "# Workflow\n", + "\n", + "Step 1: Setup AF2Complex by running through the setup module\n", + "\n", + "Step 2: Pick one of three Target Run modules and follow the steps in each module\n", + "\n", + "Step 3: Download your predictions\n", + "- After running AF2Complex on a target the prediction location will be printed out Such as the screenshot below. 
\n", + "\n", + "\n", + "from IPython.utils import io\n", + "import os\n", + "import subprocess\n", + "import tqdm.notebook\n", + "from google.colab import output\n", + "import os\n", + "\n", + "output.enable_custom_widget_manager()\n", + "os.environ['TF_FORCE_UNIFIED_MEMORY'] = '1'\n", + "os.environ['XLA_PYTHON_CLIENT_MEM_FRACTION'] = '2.0'\n", + "\n", + "#SOURCE_URL = \"https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar\"\n", + "SOURCE_URL = \"https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar\"\n", + "PARAMS_DIR = '/content/afold/data/params'\n", + "\n", + "PARAMS_PATH = os.path.join(PARAMS_DIR, os.path.basename(SOURCE_URL))\n", + "\n", + "TQDM_BAR_FORMAT = '{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]'\n", + "try:\n", + " with tqdm.notebook.tqdm(total=100, bar_format=TQDM_BAR_FORMAT) as pbar:\n", + " with io.capture_output() as captured:\n", + "\n", + " if not os.path.exists(PARAMS_DIR):\n", + " %shell mkdir --parents \"{PARAMS_DIR}\"\n", + " %shell wget -O \"{PARAMS_PATH}\" \"{SOURCE_URL}\"\n", + " pbar.update(40)\n", + " \n", + " %shell tar --extract --verbose --file=\"{PARAMS_PATH}\" \\\n", + " --directory=\"{PARAMS_DIR}\" --preserve-permissions\n", + " %shell rm \"{PARAMS_PATH}\"\n", + " pbar.update(60)\n", + " else:\n", + " pbar.update(100)\n", + "except subprocess.CalledProcessError:\n", + " print(captured)\n", + " raise\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zo7qqlklKQmd" + }, + "outputs": [], + "source": [ + "#@title 2. 
Install AF2Complex\n", + "\n", + "#@markdown Please execute this cell by pressing the _Play_ button \n", + "#@markdown \n", + "#@markdown This installs AF2Complex and the python packages it uses\n", + "\n", + "import os\n", + "import subprocess\n", + "\n", + "AF2C_examples = '/content/af2complex/example'\n", + "AF2C_src = '/content/af2complex/src'\n", + "AF_LIB_DIR = os.path.join(AF2C_src, 'alphafold')\n", + "UPLOAD_DIR = '/content/uploaded_feats/'\n", + "os.chdir('/content/')\n", + "\n", + "try:\n", + " with tqdm.notebook.tqdm(total=100, bar_format=TQDM_BAR_FORMAT) as pbar:\n", + " with io.capture_output() as captured:\n", + " if not os.path.exists('/content/af2complex'):\n", + " %shell git clone https://github.com/FreshAirTonight/af2complex.git\n", + " pbar.update(15)\n", + "\n", + " #Install third-party software\n", + " %shell pip uninstall -y tensorflow keras\n", + " pbar.update(5)\n", + " # Install py3dmol.\n", + " %shell pip install py3dmol\n", + " pbar.update(5)\n", + " %shell cd af2complex && pip install -r requirements.txt\n", + " pbar.update(50)\n", + "\n", + " if not os.path.exists('/content/uploaded_feats/'):\n", + " %shell mkdir /content/uploaded_feats/\n", + " pbar.update(25)\n", + "except subprocess.CalledProcessError:\n", + " print(captured)\n", + " raise\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "-dkd1Qcz4254" + }, + "outputs": [], + "source": [ + "#@title 3. 
#@title 3. Define the configuration of your structure prediction run
#@markdown **Note**: Please re-run this cell if any variable below is changed

#@markdown Choose preset model configuration:
#@markdown - **deepmind**: standard settings according to DeepMind, i.e., 3 recycles and 1 ensemble;
#@markdown - **economy**: no ensemble, up to 256 MSA clusters, recycling up to 3 rounds;
#@markdown - **super/super2**: 1 or 2 ensembles, up to 512 MSA clusters, recycling up to 20 rounds;
#@markdown - **genome/genome2**: 1 or 2 ensembles, up to 512 MSA clusters, max number
#@markdown of recycles and ensembles adjusted according to input sequence length;
#@markdown - **expert**: similar to super but maintain the same recycle number regardless target size;
#@markdown - **casp14**: 8 model ensemblings used by DeepMind in CASP14.
import shlex

import numpy as np

DATA_DIR = '/content/afold/data/'
preset = 'economy' #@param ['deepmind', 'casp14', 'economy', 'super', 'expert', 'super2', 'genome', 'genome2']

#@markdown Choose between multimer_v3 or ptm AF parameter sets:
model_type = 'multimer_v3' #@param ['multimer_v3', 'monomer_ptm']
model_preset = {
    'multimer_v3': 'multimer_np',
    'monomer_ptm': 'monomer_ptm',
}[model_type]
# Model names below use the short 'ptm' suffix rather than 'monomer_ptm'.
if model_type == 'monomer_ptm':
    model_type = 'ptm'

#@markdown There are five different models you can choose from, check the ones you want to run (please check at least one)
param_set_1 = True #@param {type:"boolean"}
param_set_2 = False #@param {type:"boolean"}
param_set_3 = False #@param {type:"boolean"}
param_set_4 = False #@param {type:"boolean"}
param_set_5 = False #@param {type:"boolean"}

param_set_nums = [param_set_1,param_set_2,param_set_3,param_set_4,param_set_5]
assert np.any(param_set_nums), 'Please check one of the param_sets '
# Checkbox i (1-based) selects "model_<i>_<model_type>".
models = [
    f"model_{i+1}_{model_type}"
    for i, param_set in enumerate(param_set_nums)
    if param_set
]

#@markdown Choose your recycling setting:
#@markdown 0. no recycle info saving
#@markdown 1. print metrics of intermediate recycles
#@markdown 2. additionally saving pdb structures of all recycles,
#@markdown 3. additionally save all results in pickle
recycling_setting="1" #@param [0, 1, 2, 3]

#@markdown Input below how many predictions (each with a different random seed) will be
#@markdown generated per model.

#@markdown E.g. if this is 2 and there are 5
#@markdown models then there will be 10 predictions per input.
num_predictions_per_model=1 #@param {type:"integer"}

#@markdown Input below the maximum number of recycles. Leave as -1 if you don't want to limit the number of recycles.
max_recycles = 4 #@param {type: "integer"}


class dotdict(dict):
    """dot.notation access to dictionary attributes.

    Attribute reads go through ``dict.get``, so a missing key reads as
    ``None`` instead of raising ``AttributeError``.
    """
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


def make_default_flags():
    """Return a fresh ``dotdict`` holding the default run configuration.

    ``target_lst_file`` is the key the notebook cells read and write;
    ``target_lst_path`` is kept only so existing lookups keep working.
    """
    return dotdict({
        'target_lst_path':None,
        'target_lst_file':None,
        'output_dir':'/content/af2complex/example/af2c_mod',
        'feature_dir':'/content/af2complex/example/af2c_fea',
        'model_names':None,
        'data_dir':DATA_DIR,
        'preset':'economy',
        'random_seed':None,
        'max_recycles':None,
        'num_ensemble':None,
        'max_msa_clusters':None,
        'max_extra_msa':None,
        'write_complex_features':False,
        'no_template':False,
        'output_pickle':True,
        'save_recycled':0,
        'checkpoint_tag':None,
        'max_mono_msa_depth':10000,
        'mono_msa_crop_size':5000,
        'max_template_hits':4,
        'model_preset':'monomer_ptm',
        'num_predictions_per_model':1,
        'msa_pairing':None,
        'do_cluster_analysis':False,
        'cluster_edge_thres':10,
    })


FLAGS = make_default_flags()
FLAGS['preset'] = preset
FLAGS['model_preset'] = model_preset
FLAGS['model_names'] = models
FLAGS['save_recycled'] = recycling_setting
FLAGS['num_predictions_per_model'] = num_predictions_per_model
FLAGS['max_recycles'] = max_recycles


def make_mod_params():
    """Build the command-line argument string for ``run_af2c_mod.py`` from FLAGS.

    Returns:
      A single space-separated string of ``--flag=value`` tokens. Each token is
      passed through ``shlex.quote`` so values containing spaces or shell
      metacharacters survive the ``%shell`` invocation; ordinary values are
      returned unchanged.

    Raises:
      ValueError: if ``FLAGS['target_lst_file']`` has not been set.
    """
    target_lst_file = FLAGS.get('target_lst_file')
    if not target_lst_file:
        raise ValueError(
            "FLAGS['target_lst_file'] is not set; run a target-definition cell first.")

    preset = FLAGS['preset']
    model_preset = FLAGS['model_preset']
    models = FLAGS['model_names']
    recycling_setting = FLAGS['save_recycled']
    msa_pairing = FLAGS['msa_pairing']
    out_dir = FLAGS['output_dir']
    fea_dir = FLAGS['feature_dir']
    num_predictions_per_model = FLAGS['num_predictions_per_model']
    max_recycles = FLAGS['max_recycles']

    parameters = [
        f'--target_lst_path={target_lst_file}',
        f'--data_dir={DATA_DIR}',
        f'--output_dir={out_dir}',
        f'--feature_dir={fea_dir}',
        f'--model_names={",".join(models)}',
        f'--preset={preset}',
        f'--model_preset={model_preset}',
        f'--num_predictions_per_model={num_predictions_per_model}',
        f'--save_recycled={recycling_setting}']

    # The default for msa_pairing is None (not the form value 'none'); neither
    # should be forwarded, otherwise the script receives "--msa_pairing=None".
    if msa_pairing is not None and msa_pairing != 'none':
        parameters.append(f'--msa_pairing={msa_pairing}')
    # max_recycles defaults to None; None or a non-positive value means "no limit".
    if max_recycles is not None and max_recycles > 0:
        parameters.append(f'--max_recycles={max_recycles}')

    return ' '.join(shlex.quote(parameter) for parameter in parameters)
def show_pdb(pred_output_path, show_sidechains=False, show_mainchains=False):
    """Build a py3Dmol view of a predicted PDB file, coloured by chain.

    Args:
      pred_output_path: path of the ``.pdb`` file to render.
      show_sidechains: when true, add stick/sphere styles for side-chain atoms.
      show_mainchains: when true, add stick styles for backbone atoms.
    Returns:
      The configured ``py3Dmol`` view (not yet shown).
    """
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js',)
    # Read through a context manager so the file handle is closed promptly.
    with open(pred_output_path, 'r') as pdb_file:
        view.addModel(pdb_file.read(), 'pdb')
    view.setStyle({'cartoon': {'colorscheme': 'chain'}})
    if show_sidechains:
        BB = ['C','O','N']
        view.addStyle({'and':[{'resn':["GLY","PRO"],'invert':True},{'atom':BB,'invert':True}]},
                      {'stick':{'colorscheme':"WhiteCarbon",'radius':0.3}})
        view.addStyle({'and':[{'resn':"GLY"},{'atom':'CA'}]},
                      {'sphere':{'colorscheme':"WhiteCarbon",'radius':0.3}})
        view.addStyle({'and':[{'resn':"PRO"},{'atom':['C','O'],'invert':True}]},
                      {'stick':{'colorscheme':"WhiteCarbon",'radius':0.3}})
    if show_mainchains:
        BB = ['C','O','N','CA']
        view.addStyle({'atom':BB},{'stick':{'colorscheme':"WhiteCarbon",'radius':0.3}})

    view.zoomTo()
    return view


def get_asym_id(target, flags):
    """Defines the sequence of preprocessing steps to get the asym_id feature
    Args:
      target: dictionary with the items:
        name: name of the multimer,
        split: information about each monomer composing the multimer,
        full: a string denoting all stoichiometry and domains of all monomers
          composing the multimer to be modeled,
      flags: variable containing inference configuration
    Returns:
      asym_id
    """
    monomers = af2c.load_monomer_feature(target, flags)

    if flags.msa_pairing is not None:
        # Copy 'deletion_matrix' to 'deletion_matrix_int' where present.
        for monomer in monomers:
            feature_dict = monomer['feature_dict']
            if 'deletion_matrix' in feature_dict:
                feature_dict['deletion_matrix_int'] = feature_dict['deletion_matrix']
    curr_input = {'monomers': monomers, 'target': target, 'flags': flags}

    curr_input = af2c.targeted_domain_cropping_mono(curr_input)
    curr_input = af2c.add_asym_id_monomer_ptm(curr_input)
    asym_id = curr_input['asym_id_mono_ptm']

    return asym_id


def get_interface_score(
    model_name, target_name, full_name, asym_id, idx2chain_name, out_dir, asym_id_list):
    """Tabulate confidence metrics for one predicted model.

    Loads ``<out_dir>/<target_name>/<model_name>.pkl`` and returns a DataFrame
    with the columns 'Metric Name' and 'Value'. Cluster metrics are appended
    when ``FLAGS.do_cluster_analysis`` is set.

    Raises:
      EOFError, IOError: re-raised after printing a warning when the result
        pickle cannot be read.
    """
    pkl_path = os.path.join(out_dir, target_name, f'{model_name}.pkl')

    model_config = config.model_config(model_name[:7])
    pae_head = model_config.model.heads.predicted_aligned_error
    breaks = np.linspace(0., pae_head.max_error_bin, pae_head.num_bins - 1)
    try:
        # NOTE: pickle can execute arbitrary code; only load result files
        # produced by your own prediction runs.
        with open(pkl_path, "rb") as pkl_file:
            result = pickle.load(pkl_file)
    except (EOFError,IOError) as error:
        print(f"Warning: {target_name} {error} encountered, check the pickle file")
        raise

    super_asym_id, superid2chainids = confidence.join_superchains_asym_id(asym_id, asym_id_list)

    res = confidence.interface_score(
        result['aligned_confidence_probs'],
        breaks,
        result['structure_module']['final_atom_positions'],
        result['structure_module']['final_atom_mask'],
        super_asym_id,
        is_probs=True)

    rows = [
        ('MODEL NAME', model_name),
        ('TARGET CHAINS', full_name),
        ('===========', '==========='),
        ('pTM-score', result['ptm'].tolist()),
        ('piTM-score', result['pitm']['score'].tolist()),
        ('iRes', res['num_residues'].tolist()),
        ('iCnt', res['num_contacts'].tolist()),
        ('interface-score', res['score'].tolist()),
    ]

    if FLAGS.do_cluster_analysis:
        clus_res = confidence.cluster_analysis(
            super_asym_id,
            result['structure_module']['final_atom_positions'],
            result['structure_module']['final_atom_mask'],
            edge_contacts_thres=FLAGS.cluster_edge_thres,
            superid2chainids=superid2chainids,
        )
        cluster_identities = [
            [idx2chain_name[c] for c in cluster] for cluster in clus_res['clusters']
        ]
        rows.append(('num_clusters', clus_res['num_clusters']))
        rows.append(('cluster_sizes', clus_res['cluster_size']))
        rows.append(('clusters', cluster_identities))

    return pd.DataFrame({
        'Metric Name': [metric for metric, _ in rows],
        'Value': [value for _, value in rows],
    })


# Rebinds the module-level DATA_DIR (here without a trailing slash).
DATA_DIR = '/content/afold/data'


def display_metrics(target_lst_path, model_path, support, show_sidechains_=True, show_mainchains_=True, ):
    """Print the metric table for one model and show its 3D structure.

    Args:
      target_lst_path: path of the target list file.
      model_path: model identifier; only its basename is used to locate the
        ``.pdb``/``.pkl`` files under ``FLAGS.output_dir/<target_name>``.
      support: dict with the keys 'full_name', 'target_name',
        'idx2chain_name', 'asym_id_list' and 'asym_id'.
      show_sidechains_: forwarded to ``show_pdb``.
      show_mainchains_: forwarded to ``show_pdb``.
    """
    with io.capture_output():
        # The parsed result is unused; the call is kept so an unreadable
        # target list still fails here as it did before.
        af2c.read_af2c_target_file(target_lst_path)
    target_name = support["target_name"]
    model_name = os.path.basename(model_path)
    pdb_path = os.path.join(FLAGS.output_dir, target_name, f'{model_name}.pdb')

    metrics = get_interface_score(
        model_name, target_name, support["full_name"], support["asym_id"],
        support["idx2chain_name"], FLAGS.output_dir, support["asym_id_list"]
    )

    print(metrics.to_markdown())
    view = show_pdb(pdb_path,
                    show_sidechains=show_sidechains_,
                    show_mainchains=show_mainchains_)
    view.show()


def _load_asym_id(target, target_name):
    """Return the asym_id feature for one target (rebuilt, or read from disk)."""
    if not FLAGS.write_complex_features:
        with io.capture_output():
            return get_asym_id(target, FLAGS)

    # NOTE(review): this path is relative to the current working directory,
    # as in the original code; confirm where features_comp.pkl is written.
    feat_path = os.path.join(target_name, 'features_comp.pkl')
    try:
        # np.load refuses pickled files unless allow_pickle=True, so read the
        # pickle directly. Only load files produced by your own runs.
        with open(feat_path, 'rb') as feat_file:
            feature_dict = pickle.load(feat_file)
    except FileNotFoundError:
        print('Did not find feature_comp.pkl file. ',
              'To rebuild complex features, run without ',
              '--write_complex_features flag.')
        # Re-raise: continuing would only fail later on an unbound feature_dict.
        raise
    return feature_dict['asym_id']


def visualize(show_sidechains, show_mainchains):
    """Show one tab per predicted model for every target in FLAGS.target_lst_file.

    Args:
      show_sidechains: forwarded to ``display_metrics`` / ``show_pdb``.
      show_mainchains: forwarded to ``display_metrics`` / ``show_pdb``.
    Raises:
      FileNotFoundError: if the target list file is unset or missing.
      Exception: if a target has no prediction directory or it is empty.
    """
    target_lst_file = FLAGS.target_lst_file
    if not target_lst_file or not os.path.exists(target_lst_file):
        # A bare string cannot be raised in Python 3; raise a real exception.
        raise FileNotFoundError(f'{target_lst_file} does not exist!')
    target_lst = af2c.read_af2c_target_file(target_lst_file)
    files = []
    model2support = {}
    for target in target_lst:
        # Target names are mapped to directory names (':' -> '_x', '/' -> '+').
        target_name = target['name'].replace(':', '_x').replace('/', '+')
        target_dir = os.path.join(FLAGS.output_dir, target_name)
        target_files = sorted(os.listdir(target_dir)) if os.path.isdir(target_dir) else []
        if not target_files:
            available = os.listdir(FLAGS.output_dir) if os.path.isdir(FLAGS.output_dir) else []
            raise Exception(
                f'No predictions for {target_name}. Predictions available are for {available}. Please make sure the inference cell was run correctly.'
            )

        pdb_files = [f for f in target_files if f.endswith('.pdb')]
        if not pdb_files:
            continue

        # These values depend only on the target, so compute them once per
        # target rather than once per file in the directory.
        support = {
            'full_name': target['full'],
            'target_name': target_name,
            'idx2chain_name': get_asymid2chain_name(target),
            'asym_id_list': target['asym_id_list'],
            'asym_id': _load_asym_id(target, target_name),
        }
        for f in pdb_files:
            model_name = os.path.join(target_dir, f[:-len('.pdb')])
            files.append(model_name)
            model2support[model_name] = dict(support)

    tabs = widgets.TabBar(files)

    for i, model in enumerate(files):
        with tabs.output_to(i):
            display_metrics(
                target_lst_file,
                model,
                model2support[model],
                show_sidechains,
                show_mainchains,
            )


def get_dataset_desc(file_path):
    """Read a tab-separated dataset description file into a DataFrame.

    Lines starting with '#' and blank lines are skipped. The first remaining
    line is treated as the column header and dropped. Rows with fewer than
    seven fields are padded with empty strings.

    Args:
      file_path: path of the description text file.
    Returns:
      DataFrame with the columns ID, Gene, AC, Full Length, Range, Length and
      Description (all values are strings).
    """
    try:
        from google.colab import data_table
        data_table.enable_dataframe_formatter()
    except ImportError:
        # Outside Colab the interactive table formatter is unavailable; the
        # DataFrame is still returned and rendered by the default formatter.
        pass

    columns = ['ID', 'Gene', 'AC', 'Full Length', 'Range', 'Length', 'Description']
    rows = []
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('#') or not line.strip():
                continue
            fields = line.rstrip('\r\n').split('\t')
            fields += [''] * (len(columns) - len(fields))
            rows.append(fields[:len(columns)])

    # rows[0] is the header line of the file.
    return pd.DataFrame(rows[1:], columns=columns)
+ "print(f'DONE! (predictions available on {FLAGS.output_dir}' )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "sd1bBQgTJLEA" + }, + "outputs": [], + "source": [ + "# %matplotlib inline\n", + "\n", + "#@markdown #2. Visualize your results below by pressing the *Play* button on the left\n", + "#@markdown Choose one of the AF2Complex examples to visualize below! \n", + "FLAGS['feature_dir'] = '/content/af2complex/example/af2c_fea' \n", + "FLAGS['output_dir'] = '/content/af2complex/example/af2c_mod'\n", + "\n", + "example = 'H1065' #@param ['H1065', 'H1072', 'H1072_H1065', 'H1060v4']\n", + "\n", + "target_lst_file = {\n", + " 'H1065': '/content/af2complex/example/targets/example1.lst',\n", + " 'H1072': '/content/af2complex/example/targets/example2.lst',\n", + " 'H1072_H1065': '/content/af2complex/example/targets/example3.lst',\n", + " 'H1060v4': '/content/af2complex/example/targets/example4.lst',\n", + "}[example]\n", + "\n", + "FLAGS['target_lst_file'] = target_lst_file\n", + "FLAGS['msa_pairing'] = msa_pairing\n", + "\n", + "pred_params = make_mod_params()\n", + "\n", + "show_sidechains = False #@param {type: 'boolean'}\n", + "show_mainchains = False #@param {type: 'boolean'}\n", + "\n", + "\n", + "AF2C_examples = '/content/af2complex/example'\n", + "AF2C_egtargets = os.path.join(AF2C_examples, 'targets')\n", + "\n", + "visualize(show_sidechains, show_mainchains)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1s0RID2AYdpk" + }, + "source": [ + "# Target Run (within the *E. coli* by using pre-generated features for the proteome!)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ClAAecbPYrr0" + }, + "outputs": [], + "source": [ + "import os\n", + "#@title 1. 
Download the dataset from [Zenodo](https://zenodo.org/record/7008599#.YwFWR3bMJaQ)\n", + "#@markdown Note: Usually takes less than 20 minutes.\n", + "\n", + "AF2C_examples = '/content/af2complex/example'\n", + "os.chdir(AF2C_examples)\n", + "AF2C_ecoli = os.path.join(AF2C_examples, 'ecoli')\n", + "zenodo_link = 'https://zenodo.org/record/7008599/files/af2c_fea_ecoli_220331_msa10ktem10.tar?download=1'\n", + "\n", + "AF2C_ecoli_path = os.path.join(AF2C_ecoli, os.path.basename(zenodo_link))\n", + "\n", + "if not os.path.exists(AF2C_ecoli):\n", + " os.mkdir(AF2C_ecoli)\n", + "\n", + "%shell wget -O {AF2C_ecoli_path} {zenodo_link}\n", + " \n", + "with io.capture_output() as captured:\n", + " %shell tar --extract --verbose --file={AF2C_ecoli_path} \\\n", + " --directory={AF2C_ecoli} --preserve-permissions\n", + " \n", + "AF2C_ecoli_feas = AF2C_ecoli_path.split('.tar')[0]\n", + "txt_zenodo_link = \"https://zenodo.org/record/7008599/files/ecoli_af2c_fea.txt?download=1\"\n", + "AF2C_ecoli_txt_path = os.path.join(AF2C_ecoli, os.path.basename(txt_zenodo_link))\n", + "\n", + "%shell wget -O {AF2C_ecoli_txt_path} {txt_zenodo_link}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "IQuqyTcMMaS7" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# @markdown # Find the genes in *E. 
coli* with pre-generated input features:\n", + "\n", + "ecoli_df = get_dataset_desc(AF2C_ecoli_txt_path)\n", + "ecoli_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "yNYa7QFWQPZF" + }, + "outputs": [], + "source": [ + "import subprocess\n", + "import numpy as np\n", + "# from run_af2c_mod import FLAGS\n", + "os.chdir(AF_LIB_DIR)\n", + "# print(FLAGS.fea_dir, AF2C_ecoli_feas)\n", + "FLAGS.feature_dir = AF2C_ecoli_feas\n", + "# print(FLAGS.fea_dir, AF2C_ecoli_feas)\n", + "FLAGS.output_dir = '/content/af2complex/example/ecoli/af2c_mod'\n", + "if not os.path.exists(FLAGS.output_dir):\n", + " os.mkdir(FLAGS.output_dir)\n", + "\n", + "#@markdown #2. Define your protein complex target using the UniProt IDs you found in the table above, then run AF2Complex using the *Play* button on the left\n", + "#@markdown Define how the chains compose the target,\n", + "#@markdown e.g.: \n", + "# #@markdown - T1065s1/T1065s2 *(Explanation on [example1](https://github.gatech.edu/gmu3/af2complex/tree/master/example#example-1) for more)*\n", + "# #@markdown - T1072s1:2/T1072s2:2 *(Explanation on [example2](https://github.gatech.edu/gmu3/af2complex/tree/master/example#example-2) for more)\n", + "# #@markdown - T1065s1/T1065s2+T1072s1:2/T1072s2:2 *(Explanation on [example3](https://github.gatech.edu/gmu3/af2complex/tree/master/example#example-3) for more)*\n", + "# #@markdown - T1060s3:12 *(Explanation on [example4](https://github.gatech.edu/gmu3/af2complex/tree/master/example#example-4) for more)*\n", + "\n", + "#@markdown - SECE/SECG/SECY *(SecYEG translocon, a hetero-trimer composed of SecE, SecG, and SecY, 680 AAs)*\n", + "#@markdown - PPID|265-359/DSBA|20-208 *(PpiD parvulin domain and DsbA, each has a residue ID range, 285 AAs)*\n", + "#@markdown - SURA|21-428/BAMA|21-420 *(surA and BamA, both have signal peptide removed, 808 AAs)*\n", + "#@markdown - PPID/YFGM *(chaperon proteins PpiD and YfgM, 829 AAs)*\n", + 
"#@markdown - CCMA:2/CCMB:2/CCMC/CCMD/CCME *(CcmI system, 1327 AAs)*\n", + "#@markdown - YAJC:3 *(YajC, a membrane protein chaperon? 330 AAs)*\n", + "\n", + "#@markdown Note that a large target may require resources beyond the free-tier.\n", + "\n", + "# chains = 'e.g. T1065s1/T1065s2' #@param {type:'string'}\n", + "chains = 'SECE/SECG/SECY' #@param {type:'string'}\n", + "\n", + "#@markdown Name your target\n", + "# target = 'e.g. H1065' #@param {type: 'string'}\n", + "target = 'SecYEG' #@param {type: 'string'}\n", + "\n", + "#@markdown Put down the total number of AA of the target (does not need to be exact as this number will be parsed but not used in the code)\n", + "num_AA = 680 #@param{type: 'integer'}\n", + "\n", + "#@markdown Choose the type of msa pairing you want to use (Note: 'none' will do no msa_pairing, 'all' will do species pairing as in AF-Multimer):\n", + "msa_pairing = 'all' #@param ['none', 'all', 'custom', 'cyclic', 'linear']\n", + "FLAGS.msa_pairing = msa_pairing\n", + "\n", + "target_lst = f'{chains} {num_AA} {target}'\n", + "\n", + "target_lst_file = os.path.join(AF2C_ecoli, f'{target}.lst')\n", + "with open(target_lst_file, 'w') as f:\n", + " f.write(target_lst)\n", + " f.close()\n", + "FLAGS.target_lst_file = target_lst_file\n", + "\n", + "pred_params = make_mod_params()\n", + "\n", + "# with io.capture_output() as captured:\n", + "%shell python -u ../run_af2c_mod.py {pred_params}\n", + "print(f'DONE! 
(predictions available on {FLAGS.output_dir}' )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "MewHtxVmBxNS" + }, + "outputs": [], + "source": [ + "#@markdown # Press **Play** button to the left to see which targets you have predictions for so far\n", + "from ipywidgets import interact\n", + "pd.DataFrame({\n", + " 'Target Name': os.listdir(FLAGS.output_dir),\n", + " 'Number of Predictions': [\n", + " len(list(filter(lambda x: '.pdb' in x, os.listdir(os.path.join(FLAGS.output_dir,f )))))\n", + " for f in os.listdir(FLAGS.output_dir)],\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "9G2i6I84ZJql" + }, + "outputs": [], + "source": [ + "#@markdown #3. Visualize your results below by pressing the *Play* button on the left\n", + "\n", + "#@markdown Check out the cell above to see which targets have predictions. Input the target name below to visualize the proteins. 
\n", + "target_name = 'SecYEG' #@param {type: 'string'}\n", + "FLAGS.target_lst_file = os.path.join(AF2C_ecoli, f'{target_name}.lst')\n", + "if not os.path.exists(FLAGS.target_lst_file):\n", + " raise Exception(f' Target: predictions for {target_name} do not exist, run the cell above to see which targets have predictions')\n", + "\n", + "show_sidechains = False #@param {type: 'boolean'}\n", + "show_mainchains = False #@param {type: 'boolean'}\n", + "\n", + "visualize(show_sidechains, show_mainchains)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tct8Uw1TW8RQ" + }, + "source": [ + "#Target Run (Upload your own features)\n", + "\n", + "First create a folder with your features with the following file structure: \n", + "```\n", + "dataset_name\n", + "│ \n", + "└───chain_1\n", + "│ │ \n", + "│ └───features.pkl\n", + "└───chain_2\n", + "│ │ \n", + "│ └───features.pkl\n", + "...\n", + "```\n", + "Then, upload a .tar (or .tgz) file of this folder below (Section 1a). You can create a .tar file with the following unix terminal command (Please keep the folder name and the .tar file name the same): \n", + "\n", + "\n", + "```\n", + "tar -czf af2c_fea.tgz af2c_fea\n", + "```\n", + "\n", + "\n", + "Optionally, you can also upload a txt file describing the dataset to easily search through the dataset. 
It should look like the following (all tab separated):\n", + "```\n", + "### ID -- UniProt ID\n", + "### Gene -- Recommended gene name\n", + "### AC -- Accession ID\n", + "### Fulllen -- Full sequence length\n", + "### Range -- Residue range of the longest mature chain\n", + "### Len -- Sequence length\n", + "### Description -- description of the gene\n", + "ID\tGene\tAC\tFullLen\tRange\tLen\tDescription\n", + "3MG1\ttag\tP05100\t187\t1-187\t187\tDNA-3-methyladenine glycosylase 1\n", + "...\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "nnj1Vwg9KW3A" + }, + "outputs": [], + "source": [ + "#@markdown #1a. Upload your dataset here (press *Play* button to the left)\n", + "#@markdown Please upload only one dataset at a time\n", + "\n", + "from google.colab import files\n", + "os.chdir(UPLOAD_DIR)\n", + "print('Upload the .tar (or .tgz) file')\n", + "\n", + "uploaded = files.upload()\n", + "dset_file = list(uploaded.keys())[0]\n", + "dset_name = dset_file.split('.')[0]\n", + "dset_dir = os.path.join(UPLOAD_DIR, dset_name)\n", + "print(f\"INFO: Uploaded dataset with name: {dset_name}\")\n", + " \n", + "with io.capture_output() as captured:\n", + " %shell tar --extract --verbose --file=/content/uploaded_feats/{dset_file} \\\n", + " --directory={UPLOAD_DIR} --preserve-permissions\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "XgDHn7W2vEWX" + }, + "outputs": [], + "source": [ + "#@markdown #1b. 
Check out the dataset (Optional)\n", + "#@markdown Upload the dataset description file\n", + "\n", + "os.chdir(UPLOAD_DIR)\n", + "print('Upload the .txt description file')\n", + "uploaded = files.upload()\n", + "desc_file = list(uploaded.keys())[0]\n", + "dset_df = get_dataset_desc(os.path.join(UPLOAD_DIR, desc_file))\n", + "dset_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "76o6KEjsy6R7" + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "# The run script is invoked later as ../run_af2c_mod.py, i.e. relative to AF_LIB_DIR.\n", + "os.chdir(AF_LIB_DIR)\n", + "# Read features from the uploaded dataset and write predictions next to them.\n", + "FLAGS.feature_dir = os.path.join(UPLOAD_DIR, dset_name)\n", + "FLAGS.output_dir = os.path.join(UPLOAD_DIR, dset_name, 'af2c_mod')\n", + "# makedirs(exist_ok=True) is safe to re-run and also creates missing parent folders.\n", + "os.makedirs(FLAGS.output_dir, exist_ok=True)\n", + "\n", + "#@markdown #2. Define your protein complex target and Run AF2Complex on it using the *Play* button on the left\n", + "#@markdown Define how the chains compose the target, look at sections above for more information (Section 2 of *E. coli* target run)\n", + "\n", + "#@markdown Note that a large target may require resources beyond the free-tier.\n", + "\n", + "# chains = 'e.g. T1065s1/T1065s2' #@param {type:'string'}\n", + "chains = 'HgcA/HgcB' #@param {type:'string'}\n", + "\n", + "#@markdown Name your target\n", + "# target = 'e.g. 
H1065' #@param {type: 'string'}\n", + "target = 'HgcAB' #@param {type: 'string'}\n", + "\n", + "#@markdown Put down the total number of AA of the target (does not need to be exact as this number will be parsed but not used in the code)\n", + "num_AA = 433 #@param{type: 'integer'}\n", + "\n", + "#@markdown Choose the type of msa pairing you want to use (Note: 'none' will do no msa_pairing, 'all' will do species pairing as in AF-Multimer):\n", + "msa_pairing = 'all' #@param ['none', 'all', 'custom', 'cyclic', 'linear']\n", + "FLAGS.msa_pairing = msa_pairing\n", + "\n", + "# One target definition line: chain composition, residue count, target name.\n", + "target_lst = f'{chains} {num_AA} {target}'\n", + "\n", + "target_lst_file = os.path.join(dset_dir, f'{target}.lst')\n", + "# The with-block closes the file on exit, so no explicit close() is needed.\n", + "with open(target_lst_file, 'w') as f:\n", + "    f.write(target_lst)\n", + "FLAGS.target_lst_file = target_lst_file\n", + "\n", + "pred_params = make_mod_params()\n", + "\n", + "# with io.capture_output() as captured:\n", + "%shell python -u ../run_af2c_mod.py {pred_params}\n", + "print(f'DONE! (predictions available on {FLAGS.output_dir})')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "01DWp6Sjk6pP" + }, + "outputs": [], + "source": [ + "#@markdown # Press **Play** button to the left to see which targets you have predictions for so far\n", + "from ipywidgets import interact\n", + "\n", + "# List the output folder once and keep only sub-directories, so a stray file\n", + "# cannot make os.listdir() fail and both columns always have the same length.\n", + "pred_dirs = sorted(\n", + "    d for d in os.listdir(FLAGS.output_dir)\n", + "    if os.path.isdir(os.path.join(FLAGS.output_dir, d)))\n", + "pd.DataFrame({\n", + "    'Target Name': pred_dirs,\n", + "    'Number of Predictions': [\n", + "        sum('.pdb' in x for x in os.listdir(os.path.join(FLAGS.output_dir, d)))\n", + "        for d in pred_dirs],\n", + "})\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "1rjkGMLYyVcU" + }, + "outputs": [], + "source": [ + "# %matplotlib inline\n", + "\n", + "#@markdown #3. 
Visualize your results below by pressing the *Play* button on the left\n", + "\n", + "#@markdown Place the target name you want to visualize below\n", + "# target = 'e.g. H1065' #@param {type: 'string'}\n", + "target_name = 'HgcAB' #@param {type: 'string'}\n", + "# Point FLAGS at the <target_name>.lst file under dset_dir. Note that the check\n", + "# below tests for this list file, not for the prediction files themselves.\n", + "FLAGS.target_lst_file = os.path.join(dset_dir, f'{target_name}.lst')\n", + "if not os.path.exists(FLAGS.target_lst_file):\n", + " raise Exception(f' Target: predictions for {target_name} do not exist, run the cell above to see which targets have predictions')\n", + "\n", + "show_sidechains = False #@param {type: 'boolean'}\n", + "show_mainchains = False #@param {type: 'boolean'}\n", + "\n", + "# visualize() is defined outside this cell; it presumably locates the target via\n", + "# FLAGS.target_lst_file set above -- TODO confirm against its definition.\n", + "visualize(show_sidechains, show_mainchains)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7PN-BhHoTtCr" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "private_outputs": true, + "provenance": [], + "toc_visible": true + }, + "gpuClass": "standard", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]" + }, + "vscode": { + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
