From 849c8fc3cc72fdfb7ad7ea6c203c1f9ae773dab3 Mon Sep 17 00:00:00 2001
From: Jingbo He <h570605787@gmail.com>
Date: Mon, 9 Sep 2024 04:48:37 +0000
Subject: [PATCH] Update 5 files

- /Solution/inference_script.py
- /Solution/inference_cli.py
- /Solution/inference.py
- /Solution/inference_CLI.py
- /Solution/Solution.md
---
 Solution/Solution.md                        | 43 +++++++++++++++++--
 Solution/inference.py                       | 23 ++++++++++
 .../{inference_cli.py => inference_CLI.py}  |  0
 Solution/inference_script.py                | 35 ---------------
 4 files changed, 63 insertions(+), 38 deletions(-)
 create mode 100644 Solution/inference.py
 rename Solution/{inference_cli.py => inference_CLI.py} (100%)
 delete mode 100644 Solution/inference_script.py

diff --git a/Solution/Solution.md b/Solution/Solution.md
index f9eac96..330d538 100644
--- a/Solution/Solution.md
+++ b/Solution/Solution.md
@@ -1,5 +1,3 @@
-# Solution for tasks
-
 ## 1.Sequence Inference
 
 Using DNABERT-2 from Hugging Face to calculate embedding of DNA sequence: AAGTCGTTACGGTACCGTAGCTTACGGCATTA
@@ -10,14 +8,51 @@
 import torch
 from transformers import BertModel, AutoTokenizer
 ```
+Note that the plain `AutoModel` class can't load `DNABERT-2` directly, since `DNABERT-2` is a custom model; it requires `trust_remote_code=True` (as in `inference.py` below), so `BertModel` is used here instead.
 
-### 1.2 Load tokenizer and model
+### 1.2 Load tokenizer and model from Hugging Face
 
 ```python
 tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M")
 model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M")
 ```
 
+### 1.3 Define and tokenize DNA sequence
+
+Define the DNA sequence for inference:
+
+```python
+dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
+```
+
+Tokenize the input sequence:
+
+```python
+inputs = tokenizer(dna_sequence, return_tensors='pt')["input_ids"]
+```
+
+### 1.4 Calculate embedding
+
+Run the model to get the hidden states:
+
+```python
+with torch.no_grad():  # Disable gradient calculations for inference
+    hidden_states = model(inputs)[0]  # Shape: [1, sequence_length, 768]
+```
+
+Mean-pool over the sequence dimension to get a single embedding vector:
+
+```python
+embedding_mean = torch.mean(hidden_states[0], dim=0)
+```
+
+Print the resulting embedding:
+
+```python
+print("Embedding shape:", embedding_mean.shape)
+print("Embedding vector:", embedding_mean)
+```
+
 ## 3. Docker Container Usage
 
 Containerize the inference process using Docker.
@@ -87,3 +122,5 @@ docker run -it dnabert_inference
 
 `-t`: Allocates a pseudo-TTY.
 
+## 4. Evaluation of the Success
+
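A note on the pooling step added to Solution.md above: taking a plain mean over `hidden_states[0]` works for a single sequence, but if several sequences are batched together, padded positions would dilute the average. Below is a minimal sketch of mask-aware mean pooling; it assumes the tokenizer returns the standard Hugging Face `attention_mask` and that DNABERT-2's custom `forward` accepts an `attention_mask` keyword, which may need checking against the model's remote code.

```python
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

# Two sequences of different lengths; padding=True pads them to a common length.
batch = tokenizer(
    ["AAGTCGTTACGGTACCGTAGCTTACGGCATTA", "AAGTCGTTACG"],
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    # Assumption: the custom DNABERT-2 forward accepts attention_mask.
    hidden_states = model(batch["input_ids"], attention_mask=batch["attention_mask"])[0]

# Mask-aware mean pooling: zero out padded positions, divide by each true length.
mask = batch["attention_mask"].unsqueeze(-1).float()              # [2, seq_len, 1]
embeddings = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1)  # [2, 768]
print("Batch embedding shape:", embeddings.shape)
```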
diff --git a/Solution/inference.py b/Solution/inference.py
new file mode 100644
index 0000000..a4b4b35
--- /dev/null
+++ b/Solution/inference.py
@@ -0,0 +1,23 @@
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+# Load the tokenizer and model from Hugging Face
+tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
+model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
+
+# Define the DNA sequence for inference
+dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
+
+# Tokenize the input sequence
+inputs = tokenizer(dna_sequence, return_tensors='pt')["input_ids"]
+
+# Run the model to get hidden states
+with torch.no_grad():  # Disable gradient calculations for inference
+    hidden_states = model(inputs)[0]  # Shape: [1, sequence_length, 768]
+
+# Pooling to get a single embedding vector
+embedding_mean = torch.mean(hidden_states[0], dim=0)
+
+# Print the resulting embedding
+print("Embedding shape:", embedding_mean.shape)
+print("Embedding vector:", embedding_mean)
diff --git a/Solution/inference_cli.py b/Solution/inference_CLI.py
similarity index 100%
rename from Solution/inference_cli.py
rename to Solution/inference_CLI.py
diff --git a/Solution/inference_script.py b/Solution/inference_script.py
deleted file mode 100644
index a5c38da..0000000
--- a/Solution/inference_script.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import torch
-from transformers import BertModel, AutoTokenizer
-
-# Load model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M")
-model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M")
-
-# Define DNA sequence
-dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
-
-# Tokenize the sequence
-inputs = tokenizer(dna_sequence, return_tensors = 'pt')["input_ids"]
-hidden_states = model(inputs)[0]  # [1, sequence_length, 768]
-
-# embedding with mean pooling
-embedding_mean = torch.mean(hidden_states[0], dim=0)
-print(embedding_mean.shape)  # expect to be 768
-print("Mean Embedding vector:", embedding_mean)
-
-# Define the DNA sequence
-dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
-
-# Tokenize the sequence
-inputs = tokenizer(dna_sequence, return_tensors='pt')["input_ids"]
-
-# Run inference
-with torch.no_grad():
-    hidden_states = model(inputs)[0]
-
-# Pool the hidden states
-embedding_mean = torch.mean(hidden_states[0], dim=0)
-
-# Print the output
-print("Embedding shape:", embedding_mean.shape)
-print("Embedding vector:", embedding_mean)
\ No newline at end of file
--
GitLab
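The renamed `Solution/inference_CLI.py` is carried over unchanged (100% similarity), so its contents do not appear in this patch. Purely for illustration, here is a hypothetical sketch of what such a CLI wrapper could look like; the `argparse` interface and the positional `sequence` argument are assumptions, not the file's actual contents.

```python
import argparse

import torch
from transformers import AutoModel, AutoTokenizer


def main():
    # Hypothetical interface: the real inference_CLI.py may differ.
    parser = argparse.ArgumentParser(description="Compute a DNABERT-2 embedding for a DNA sequence.")
    parser.add_argument("sequence", help="DNA sequence, e.g. AAGTCGTTACGGTACCGTAGCTTACGGCATTA")
    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
    model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)

    # Tokenize, run inference without gradients, and mean-pool the hidden states.
    inputs = tokenizer(args.sequence, return_tensors="pt")["input_ids"]
    with torch.no_grad():
        hidden_states = model(inputs)[0]  # [1, sequence_length, 768]
    embedding_mean = torch.mean(hidden_states[0], dim=0)

    print("Embedding shape:", embedding_mean.shape)
    print("Embedding vector:", embedding_mean)


if __name__ == "__main__":
    main()
```

Under those assumptions it would be invoked as `python Solution/inference_CLI.py AAGTCGTTACGGTACCGTAGCTTACGGCATTA`.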