|
以下是使用PyTorch框架对IMDB电影评论数据集进行情感分析的示例代码:
# 引入相关库
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, LabelField, BucketIterator
from torchtext.datasets import IMDB
# Define how raw text and labels are processed (spaCy tokenizer, lowercased text,
# float labels so they can feed BCEWithLogitsLoss directly).
TEXT = Field(tokenize="spacy", lower=True)
LABEL = LabelField(dtype=torch.float)
# Download / load the IMDB dataset.
train_data, test_data = IMDB.splits(TEXT, LABEL)
# Build the vocabulary (top 10k tokens, pretrained GloVe vectors) and label vocab.
TEXT.build_vocab(train_data, max_size=10000, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)
# Carve a validation set (20%) out of the training split.
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())
# Bucketed iterators group examples of similar length to minimise padding.
# Bug fix: train_iterator / valid_iterator / test_iterator were referenced by
# the training loop below but never created in the original script.
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE)
# Build the model
class SentimentAnalyzer(nn.Module):
    """LSTM-based binary sentiment classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: embedding vector size.
        hidden_dim: LSTM hidden state size (per direction).
        output_dim: number of output logits (1 for binary sentiment).
        n_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions if True.
        dropout: dropout probability (embeddings, between LSTM layers, and
            on the final hidden state).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional, dropout=dropout)
        # Bidirectional LSTMs expose two final hidden states, hence 2x width.
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Remembered so forward() can pick the right hidden state(s).
        self.bidirectional = bidirectional

    def forward(self, text):
        # text: (seq_len, batch) tensor of token indices.
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.rnn(embedded)
        if self.bidirectional:
            # Concatenate the final forward (hidden[-2]) and backward
            # (hidden[-1]) hidden states of the last layer.
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Bug fix: the original always concatenated hidden[-2] and
            # hidden[-1], which is wrong (and crashes for n_layers=1)
            # when bidirectional=False.
            hidden = hidden[-1, :, :]
        hidden = self.dropout(hidden)
        return self.fc(hidden)  # raw logits, shape (batch, output_dim)
# Model hyper-parameters.
VOCAB_SIZE = len(TEXT.vocab)  # capped at 10k tokens plus specials
EMBEDDING_DIM = 100           # must match the glove.6B.100d vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # single logit for binary sentiment
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5

model = SentimentAnalyzer(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)
# Training setup: Adam with default hyper-parameters, binary cross-entropy
# on raw logits, everything moved to the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
def binary_accuracy(preds, y):
    """Return the fraction of predictions that match the labels.

    preds are raw logits: they are squashed through a sigmoid and rounded
    to {0, 1} before comparison with y. Returns a 0-dim float tensor.
    """
    predicted_labels = torch.round(torch.sigmoid(preds))
    hits = (predicted_labels == y).float()
    return hits.sum() / len(hits)
def train(model, iterator, optimizer, criterion):
    """Run one training epoch; return (mean loss, mean accuracy) over batches."""
    epoch_loss = 0
    epoch_acc = 0
    model.train()  # enable dropout
    for batch in iterator:
        optimizer.zero_grad()
        # Bug fix: the TEXT Field was built without include_lengths=True, so
        # batch.text is a single tensor; the original unpacked it into
        # (text, text_lengths) and would misbehave/crash.
        text = batch.text
        predictions = model(text).squeeze(1)  # (batch,) logits
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    """Evaluate without gradients; return (mean loss, mean accuracy)."""
    epoch_loss = 0
    epoch_acc = 0
    model.eval()  # disable dropout
    with torch.no_grad():
        for batch in iterator:
            # Bug fix: batch.text is a single tensor (no include_lengths=True
            # on the Field); the original wrongly unpacked it into two values.
            text = batch.text
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
N_EPOCHS = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    # Checkpoint whenever validation loss improves (early-stopping style).
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # Bug fix: quotes were HTML-escaped (&#39;) in the original, which is
        # a Python syntax error; restored to plain quotes.
        torch.save(model.state_dict(), 'model.pt')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
# 加载最优模型,评估测试集准确率
model.load_state_dict(torch.load(&#39;model.pt&#39;))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f&#39;Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%&#39;)以上代码演示了如何使用PyTorch框架对IMDB电影评论数据集进行情感分析。首先,使用torchtext.datasets.IMDB函数加载数据集;然后,使用Field函数和LabelField函数分别定义文本数据和标签的处理方式;接着,构建词表和数据管道,并根据需求调整模型超参数;然后,构建一个LSTM神经网络模型,包含一个嵌入层、一个LSTM层和一个输出层,并使用BCEWithLogitsLoss作为损失函数,Adam作为优化器;最后,使用train函数和evaluate函数分别训练和评估模型。
需要注意的是,以上代码只是一个简单的示例,实际应用中还需要进行更细致的数据清洗和处理,同时也需要根据具体需求调整模型参数和优化模型结构。而且,在使用PyTorch框架时,需要手动编写训练循环和评估循环,并且需要手动管理设备内存和模型参数等,因此相对Keras框架更加复杂。
|