使用示例

openclaw AI使用帮助 2

下面实现一个可在本地运行的零样本文本分类工具:模型下载并缓存到本地之后即可离线使用,无需再联网。这里基于类BERT(BERT-like)模型实现OpenClaw的核心思想。

使用示例-第1张图片-AI小龙虾下载官网 - openclaw下载 - openclaw小龙虾

方案一:基于transformers的本地零样本分类

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import numpy as np
from typing import List, Dict, Tuple
import warnings
warnings.filterwarnings('ignore')
class LocalOpenClaw:
    """
    Zero-shot text classifier that scores (text, hypothesis) pairs.

    Every candidate label is turned into a hypothesis sentence, paired with
    the input text and run through a sequence-classification model. The
    softmax probability of the model's entailment class is the label's score.

    NOTE(review): this scoring scheme presumably only yields meaningful
    results with a checkpoint fine-tuned for natural language inference
    (NLI). The default ``bert-base-uncased`` does not look like one --
    confirm, and pass the name or local path of an NLI model instead.
    """

    # Hypothesis template used when the caller does not supply one.
    DEFAULT_TEMPLATE = "This text is about {}."

    def __init__(self, model_name: str = "bert-base-uncased"):
        """
        Load the tokenizer and model and put the model in eval mode.

        Args:
            model_name: Model name or path handed to ``from_pretrained``.
        """
        print(f"加载模型: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.model.eval()

    def create_hypothesis(self, text: str, label: str) -> str:
        """
        Build the default hypothesis sentence for ``label``.

        The template is fixed rather than picked at random, so repeated calls
        give the same scores and all labels in one prediction are judged
        against the same wording.

        Args:
            text: The text being classified (unused; kept for compatibility).
            label: Candidate label to insert into the template.

        Returns:
            The hypothesis sentence.
        """
        return self.DEFAULT_TEMPLATE.format(label)

    def _entailment_index(self) -> int:
        """Return the logit index of the entailment class, or -1 if unknown."""
        config = getattr(self.model, "config", None)
        label2id = getattr(config, "label2id", None) or {}
        for name, index in label2id.items():
            if str(name).lower().startswith("entail"):
                return int(index)
        # No label is named like "entailment": keep the historical behaviour
        # of reading the last output class.
        return -1

    def predict(self,
                text: str,
                candidate_labels: List[str],
                multi_label: bool = False,
                hypothesis_template: str = None,
                threshold: float = 0.5) -> Dict:
        """
        Score ``text`` against every candidate label.

        Args:
            text: Text to classify.
            candidate_labels: Labels to choose from; may be empty.
            multi_label: If True, return every label whose score exceeds
                ``threshold`` instead of only the best one.
            hypothesis_template: Optional ``str.format`` template with one
                ``{}`` placeholder for the label. When omitted (or empty),
                ``create_hypothesis`` is used.
            threshold: Score cut-off used in multi-label mode.

        Returns:
            A dict with ``sequence``, ``labels`` and ``scores`` (both sorted
            by descending score), plus ``predictions`` in multi-label mode or
            ``prediction``/``confidence`` in single-label mode. With no
            candidate labels, ``prediction`` is None and ``confidence`` 0.0.
        """
        scored = []
        if candidate_labels:
            entail_idx = self._entailment_index()
            with torch.no_grad():
                for label in candidate_labels:
                    if hypothesis_template:
                        hypothesis = hypothesis_template.format(label)
                    else:
                        hypothesis = self.create_hypothesis(text, label)
                    # Encode text and hypothesis together as one sentence pair.
                    inputs = self.tokenizer(
                        text,
                        hypothesis,
                        return_tensors="pt",
                        truncation=True,
                        padding=True,
                        max_length=512,
                    )
                    logits = self.model(**inputs).logits
                    probs = torch.softmax(logits, dim=-1)
                    scored.append({
                        "label": label,
                        "score": probs[0, entail_idx].item(),
                    })
            scored.sort(key=lambda item: item["score"], reverse=True)

        result = {
            "sequence": text,
            "labels": [item["label"] for item in scored],
            "scores": [item["score"] for item in scored],
        }
        if multi_label:
            result["predictions"] = [
                item["label"] for item in scored if item["score"] > threshold
            ]
        else:
            # Guard against an empty candidate list instead of indexing [0].
            result["prediction"] = scored[0]["label"] if scored else None
            result["confidence"] = scored[0]["score"] if scored else 0.0
        return result

    def batch_predict(self,
                      texts: List[str],
                      candidate_labels: List[str],
                      multi_label: bool = False,
                      hypothesis_template: str = None,
                      threshold: float = 0.5) -> List[Dict]:
        """
        Run ``predict`` on each text in turn.

        Args:
            texts: Texts to classify.
            candidate_labels: Labels shared by all texts.
            multi_label: Forwarded to ``predict``.
            hypothesis_template: Forwarded to ``predict``.
            threshold: Forwarded to ``predict``.

        Returns:
            One result dict per input text, in input order.
        """
        return [
            self.predict(
                text,
                candidate_labels,
                multi_label=multi_label,
                hypothesis_template=hypothesis_template,
                threshold=threshold,
            )
            for text in texts
        ]
if __name__ == "__main__":
    # Build the classifier. Per the original note, the first run downloads
    # the model and later runs reuse the local cache.
    demo_classifier = LocalOpenClaw()

    # Sample input and the labels it may be assigned.
    sample_text = "The stock market reached new highs today after positive economic indicators were released."
    topic_choices = ["finance", "sports", "politics", "technology", "entertainment"]

    outcome = demo_classifier.predict(sample_text, topic_choices)

    print("文本:", sample_text)
    print("\n预测结果:")
    print(f"  预测类别: {outcome['prediction']}")
    print(f"  置信度: {outcome['confidence']:.3f}")

    # Full ranking, best label first.
    print("\n所有候选标签得分:")
    ranking = zip(outcome["labels"], outcome["scores"])
    for pair in ranking:
        print(f"  {pair[0]}: {pair[1]:.3f}")

方案二:使用更轻量级的模型(推荐)

from sentence_transformers import SentenceTransformer, util
import numpy as np
class LightweightOpenClaw:
    """
    Zero-shot classifier based on sentence-embedding similarity.

    The text and one descriptive sentence per candidate label are embedded
    with a sentence-transformers model; labels are ranked by cosine
    similarity to the text.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Load the embedding model.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``. The
                original notes suggest 'all-MiniLM-L6-v2' (recommended),
                'paraphrase-MiniLM-L3-v2' (smaller) or 'all-mpnet-base-v2'
                (larger but more accurate).
        """
        print(f"加载轻量模型: {model_name}")
        self.model = SentenceTransformer(model_name)

    def predict(self,
                text: str,
                candidate_labels: List[str],
                multi_label: bool = False,
                threshold: float = 0.5,
                hypothesis_template: str = "This text is about {}.") -> Dict:
        """
        Rank candidate labels by cosine similarity to ``text``.

        Args:
            text: Text to classify.
            candidate_labels: Labels to choose from; may be empty.
            multi_label: If True, return every label whose similarity exceeds
                ``threshold`` instead of only the best one.
            threshold: Similarity cut-off used in multi-label mode.
            hypothesis_template: ``str.format`` template with one ``{}``
                placeholder, used to turn each label into a sentence.

        Returns:
            A dict with ``sequence``, ``labels`` and ``scores`` (both sorted
            by descending similarity), plus ``predictions`` in multi-label
            mode or ``prediction``/``confidence`` in single-label mode. With
            no candidate labels, ``prediction`` is None and ``confidence``
            0.0, and the model is not called.
        """
        ranked = []
        if candidate_labels:
            descriptions = [hypothesis_template.format(label) for label in candidate_labels]
            text_embedding = self.model.encode(text, convert_to_tensor=True)
            label_embeddings = self.model.encode(descriptions, convert_to_tensor=True)
            # Row 0 holds the similarities of the single text to every label.
            similarities = util.cos_sim(text_embedding, label_embeddings)[0].cpu().numpy()
            ranked = [
                {"label": candidate_labels[idx], "score": float(similarities[idx])}
                for idx in np.argsort(similarities)[::-1]
            ]

        result = {
            "sequence": text,
            "labels": [item["label"] for item in ranked],
            "scores": [item["score"] for item in ranked],
        }
        if multi_label:
            result["predictions"] = [
                item["label"] for item in ranked if item["score"] > threshold
            ]
        else:
            # Guard against an empty candidate list instead of indexing [0].
            result["prediction"] = ranked[0]["label"] if ranked else None
            result["confidence"] = ranked[0]["score"] if ranked else 0.0
        return result
if __name__ == "__main__":
    # Embedding-based classifier with its default model.
    light_classifier = LightweightOpenClaw()

    # Demo sentences and the label set they are matched against.
    samples = [
        "Apple unveiled its new iPhone with advanced camera features.",
        "The football team won the championship after an intense game.",
        "The government announced new economic policies to combat inflation."
    ]
    topic_choices = ["technology", "sports", "politics", "finance", "entertainment"]
    divider = "-" * 50

    print("零样本文本分类示例:\n")
    for position, sample in enumerate(samples, start=1):
        outcome = light_classifier.predict(sample, topic_choices)
        print(f"文本{position}: {sample}")
        print(f"预测: {outcome['prediction']} (置信度: {outcome['confidence']:.3f})")
        print(divider)

方案三:简单实用的版本(无深度学习)

import math
import re
from collections import Counter
from typing import Dict, List
class SimpleOpenClaw:
    """
    Keyword-matching text classifier that uses only the standard library.

    Every known label owns a list of lowercase single-word keywords. A text
    is scored per label by how often those keywords occur in it.
    """

    def __init__(self):
        # Built-in keyword lists; extend them with add_keywords().
        self.keyword_dict = {
            "technology": ["computer", "software", "hardware", "phone", "internet",
                           "digital", "tech", "app", "program", "code", "ai", "robot"],
            "sports": ["game", "team", "player", "score", "win", "lose",
                       "championship", "football", "basketball", "sport", "athlete"],
            "finance": ["stock", "market", "money", "bank", "investment",
                        "price", "economy", "financial", "trade", "currency"],
            "politics": ["government", "president", "policy", "election", "law",
                         "political", "minister", "vote", "democracy", "party"],
            "health": ["medical", "doctor", "hospital", "disease", "health",
                       "medicine", "patient", "treatment", "virus", "vaccine"]
        }

    def preprocess(self, text: str) -> List[str]:
        """
        Split ``text`` into lowercase words.

        Characters that are neither word characters nor whitespace are
        replaced by spaces before splitting.
        """
        text = text.lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        return text.split()

    def predict(self, text: str, candidate_labels: List[str]) -> Dict:
        """
        Score ``text`` against each candidate label by keyword frequency.

        A label's raw score is the sum, over its keywords found in the text,
        of (keyword count / word count) * log(1 + 1 / number of keywords).
        Labels without a keyword list score 0. Scores are normalised to sum
        to 1 when at least one keyword matched.

        Args:
            text: Text to classify.
            candidate_labels: Labels to choose from.

        Returns:
            A dict with ``sequence``, ``prediction``, ``confidence`` and
            ``scores`` (label -> score, highest first). When no keyword
            matched at all (or there are no candidates), ``prediction`` is
            "unknown" and ``confidence`` is 0, rather than an arbitrary
            label that merely happened to be listed first.
        """
        words = self.preprocess(text)
        word_counts = Counter(words)
        total_words = len(words)

        scores = {}
        for label in candidate_labels:
            keywords = self.keyword_dict.get(label, [])
            score = 0
            if keywords:
                # Loop-invariant per label: lists with more keywords get a
                # smaller per-hit weight.
                weight = math.log(1 + 1 / len(keywords))
                for keyword in keywords:
                    if keyword in word_counts:
                        # A hit implies total_words > 0, so this is safe.
                        score += (word_counts[keyword] / total_words) * weight
            scores[label] = score

        total = sum(scores.values())
        if total > 0:
            scores = {label: value / total for label, value in scores.items()}

        sorted_scores = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
        matched = total > 0
        return {
            "sequence": text,
            "prediction": sorted_scores[0][0] if matched else "unknown",
            "confidence": sorted_scores[0][1] if matched else 0,
            "scores": dict(sorted_scores)
        }

    def add_keywords(self, label: str, keywords: List[str]):
        """
        Register extra keywords for ``label``, creating the label if needed.

        Keywords are lowercased and stripped so they can match the lowercased
        words produced by ``preprocess``; empty strings and keywords already
        registered for the label are skipped so they are not counted twice.
        Only single-word keywords can match, because the text is split on
        whitespace.
        """
        registered = self.keyword_dict.setdefault(label, [])
        for keyword in keywords:
            normalized = keyword.lower().strip()
            if normalized and normalized not in registered:
                registered.append(normalized)
if __name__ == "__main__":
    keyword_classifier = SimpleOpenClaw()

    # Quick demo on one sentence with three candidate labels.
    sample_text = "The new smartphone has amazing features and fast processor"
    outcome = keyword_classifier.predict(sample_text, ["technology", "sports", "finance"])

    print(f"文本: {sample_text}")
    print(f"预测: {outcome['prediction']}")
    print(f"置信度: {outcome['confidence']:.3f}")

    # Per-label breakdown in the order returned by predict().
    print("\n详细得分:")
    breakdown = outcome['scores']
    for name in breakdown:
        print(f"  {name}: {breakdown[name]:.3f}")

安装和运行说明

对于方案一和方案二:

# 安装依赖
pip install torch transformers sentence-transformers
# 运行
python openclaw_local.py

对于方案三:

# 无需安装额外依赖
python simple_openclaw.py

特点:

  1. 可离线运行:方案一和方案二首次运行时需要下载模型,缓存到本地后即可完全离线加载;方案三不依赖任何模型
  2. 零样本学习:不需要训练数据
  3. 灵活可扩展:支持自定义标签和模板
  4. 多种选择:提供三个不同复杂度的方案

建议:

  • 如果需要高准确度,使用方案一,并将model_name换成在NLI(自然语言推理)数据上微调过的模型(默认的bert-base-uncased未经NLI微调,得分不具参考意义)
  • 如果需要平衡速度和准确度,使用方案二
  • 如果完全不想安装深度学习库,使用方案三

你希望我详细解释哪个方案的原理,或者需要针对特定任务进行修改吗?

标签: 请输入需要生成关键词的具体内容 “深度学习 图像识别”

抱歉,评论功能暂时关闭!