From 849c8fc3cc72fdfb7ad7ea6c203c1f9ae773dab3 Mon Sep 17 00:00:00 2001
From: Jingbo He <h570605787@gmail.com>
Date: Mon, 9 Sep 2024 04:48:37 +0000
Subject: [PATCH] Update 5 files

- /Solution/inference_script.py
- /Solution/inference_cli.py
- /Solution/inference.py
- /Solution/inference_CLI.py
- /Solution/Solution.md
---
 Solution/Solution.md                          | 43 +++++++++++++++++--
 Solution/inference.py                         | 23 ++++++++++
 .../{inference_cli.py => inference_CLI.py}    |  0
 Solution/inference_script.py                  | 35 ---------------
 4 files changed, 63 insertions(+), 38 deletions(-)
 create mode 100644 Solution/inference.py
 rename Solution/{inference_cli.py => inference_CLI.py} (100%)
 delete mode 100644 Solution/inference_script.py

diff --git a/Solution/Solution.md b/Solution/Solution.md
index f9eac96..330d538 100644
--- a/Solution/Solution.md
+++ b/Solution/Solution.md
@@ -1,5 +1,3 @@
-# Solution for tasks
-
 ## 1.Sequence Inference
 
 Using DNABERT-2 from Hugging Face to calculate embedding of DNA sequence: AAGTCGTTACGGTACCGTAGCTTACGGCATTA
@@ -10,14 +8,51 @@ Using DNABERT-2 from Hugging Face to calculate embedding of DNA sequence: AAGTCG
 import torch
 from transformers import BertModel, AutoTokenizer
 ```
+Note: `DNABERT-2` cannot be loaded with the `AutoModel` class because it is a custom model, so `BertModel` is used instead.
 
-### 1.2 Load tokenizer and model
+### 1.2 Load tokenizer and model from Hugging Face
 
 ```python
 tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M")
 model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M")
 ```
 
+### 1.3 Define and tokenize DNA sequence
+
+Define the DNA sequence for inference
+
+```python
+dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
+```
+
+Tokenize the input sequence
+
+```python
+inputs = tokenizer(dna_sequence, return_tensors='pt')["input_ids"]
+```
+
+### 1.4 Calculate embedding
+
+Run the model to get hidden states
+
+```python
+with torch.no_grad():  # Disable gradient calculations for inference
+    hidden_states = model(inputs)[0]  # Shape: [1, sequence_length, 768]
+```
+
+Pooling to get a single embedding vector
+
+```python
+embedding_mean = torch.mean(hidden_states[0], dim=0)
+```
+
+Print the resulting embedding
+
+```python
+print("Embedding shape:", embedding_mean.shape)
+print("Embedding vector:", embedding_mean)
+```
+
 ## 3. Docker Container Usage
 
 Containerize the inference process using Docker.
@@ -87,3 +122,5 @@ docker run -it dnabert_inference
 
 `-t`: Allocates a pseudo-TTY.
 
+## 4. Evaluation of the Success
+
diff --git a/Solution/inference.py b/Solution/inference.py
new file mode 100644
index 0000000..a4b4b35
--- /dev/null
+++ b/Solution/inference.py
@@ -0,0 +1,23 @@
+import torch
+from transformers import BertModel, AutoTokenizer
+
+# Load the tokenizer and model from Hugging Face (BertModel is the class imported above; AutoModel is not in scope)
+tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
+model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M")
+
+# Define the DNA sequence for inference
+dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
+
+# Tokenize the input sequence; only the token-id tensor is passed to the model
+inputs = tokenizer(dna_sequence, return_tensors='pt')["input_ids"]
+
+# Run the model to get hidden states (first element of the model output)
+with torch.no_grad():  # Disable gradient calculations for inference
+    hidden_states = model(inputs)[0]  # Shape: [1, sequence_length, 768]
+
+# Mean-pool over the sequence axis of the single batch item to get one embedding vector
+embedding_mean = torch.mean(hidden_states[0], dim=0)
+
+# Print the resulting embedding
+print("Embedding shape:", embedding_mean.shape)
+print("Embedding vector:", embedding_mean)
diff --git a/Solution/inference_cli.py b/Solution/inference_CLI.py
similarity index 100%
rename from Solution/inference_cli.py
rename to Solution/inference_CLI.py
diff --git a/Solution/inference_script.py b/Solution/inference_script.py
deleted file mode 100644
index a5c38da..0000000
--- a/Solution/inference_script.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import torch
-from transformers import BertModel, AutoTokenizer
-
-# Load model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M")
-model = BertModel.from_pretrained("zhihan1996/DNABERT-2-117M")
-
-# Define DNA sequence
-dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
-
-# Tokenize the sequence
-inputs = tokenizer(dna_sequence, return_tensors = 'pt')["input_ids"]
-hidden_states = model(inputs)[0] # [1, sequence_length, 768]
-
-# embedding with mean pooling
-embedding_mean = torch.mean(hidden_states[0], dim=0)
-print(embedding_mean.shape) # expect to be 768
-print("Mean Embedding vector:", embedding_mean)
-
-# Define the DNA sequence
-dna_sequence = "AAGTCGTTACGGTACCGTAGCTTACGGCATTA"
-
-# Tokenize the sequence
-inputs = tokenizer(dna_sequence, return_tensors='pt')["input_ids"]
-
-# Run inference
-with torch.no_grad():
-    hidden_states = model(inputs)[0]
-
-# Pool the hidden states
-embedding_mean = torch.mean(hidden_states[0], dim=0)
-
-# Print the output
-print("Embedding shape:", embedding_mean.shape)
-print("Embedding vector:", embedding_mean)
\ No newline at end of file
-- 
GitLab