improved imports, dummy dataset generation

This commit is contained in:
Florian Förster
2024-08-07 20:06:06 +02:00
parent 3f58a14852
commit 9328c0218a
35 changed files with 1966 additions and 106 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Sample set: a compound noun on its own (text1), inside a phrase (text2)\n",
"# and inside a full sentence (text3).\n",
"# NOTE(review): the following cells reassign text1..text3 and `sentences`,\n",
"# so only the most recently run sample set is used downstream.\n",
"text1 = \"Betriebssicherheitsüberprüfung\"\n",
"text2 = \"die Betriebssicherheitsüberprüfung durchgeführt\"\n",
"# Disabled alternative for text2:\n",
"# text2 = \"Nach dem Batterie-Wechsel gingen alle Lichter aus\"\n",
"text3 = \"Ich habe die Betriebssicherheitsüberprüfung durchgeführt.\"\n",
"sentences = [text1, text2]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Sample set: text2 is text1 with the final letter missing (presumably an\n",
"# intentional truncation); text3 differs from text1 only in the interval prefix.\n",
"# NOTE(review): the following cell reassigns text1..text3 and `sentences`,\n",
"# so only the most recently run sample set is used downstream.\n",
"text1 = \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
"text2 = \"Wöchentliche Sichtkontrolle / Reinigun\"\n",
"# Disabled alternative for text2:\n",
"# text2 = \"Nach dem Batterie-Wechsel gingen alle Lichter aus\"\n",
"text3 = \"3-monatliche Sichtkontrolle / Reinigung\"\n",
"sentences = [text1, text2]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Sample set: text2 and text3 differ only in the interval word; text4 repeats\n",
"# text3 with misspellings (presumably intentional, to probe typo robustness).\n",
"text1 = \"Wöchentliche Sichtkontrolle / Reinigung\"\n",
"text2 = \"Wöchentliche Kontrolle der Wasseraufbereitungsanlagen\"\n",
"# Disabled alternative for text2:\n",
"# text2 = \"Nach dem Batterie-Wechsel gingen alle Lichter aus\"\n",
"text3 = \"Tägliche Kontrolle der Wasseraufbereitungsanlagen\"\n",
"text4 = \"Täglihce Kontolle der Wasseraufberitungsanlagen\"\n",
"sentences = [text1, text2]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#text1 = 'Tägliche Wartungstätigkeiten nach Vorgabe des Maschinenherstellers\\n'\n",
"#text3 = 'Tägliche Wartungstätigkeiten nach Vorgabe des Maschinenherstellers'"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\foersterflorian\\mambaforge\\envs\\test\\Lib\\site-packages\\torch\\_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n",
" return self.fget.__get__(instance, owner)()\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cosine-Similarity t1+2: tensor([[0.4740]])\n",
"Cosine-Similarity t1+3: tensor([[0.4360]])\n",
"Cosine-Similarity t2+3: tensor([[0.9494]])\n",
"Cosine-Similarity t2+4: tensor([[0.7007]])\n"
]
}
],
"source": [
"from sentence_transformers import SentenceTransformer, util\n",
"\n",
"# Alternative checkpoints (disabled):\n",
"# model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"# model = SentenceTransformer(\"all-mpnet-base-v2\")\n",
"model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')\n",
"\n",
"# Example English sentences (disabled; `sentences` is defined in an earlier cell).\n",
"# Written as real comments rather than a bare string literal, which would be\n",
"# evaluated as an expression.\n",
"# sentences = [\n",
"#     \"This framework generates embeddings for each input sentence\",\n",
"#     \"Sentences are passed as a list of string.\",\n",
"#     \"The quick brown fox jumps over the lazy dog.\",\n",
"# ]\n",
"\n",
"# Sentences are encoded by calling model.encode()\n",
"sentence_embeddings = model.encode(sentences)\n",
"t1 = model.encode(text1)\n",
"t2 = model.encode(text2)\n",
"t3 = model.encode(text3)\n",
"t4 = model.encode(text4)\n",
"\n",
"# Pairwise comparisons. The order matters: `cos_sim` is left holding the last\n",
"# pair (t2 vs. t4), and a later cell reads `cos_sim.item()`.\n",
"embedding_pairs = [\n",
"    (\"t1+2\", t1, t2),\n",
"    (\"t1+3\", t1, t3),\n",
"    (\"t2+3\", t2, t3),\n",
"    (\"t2+4\", t2, t4),\n",
"]\n",
"for pair_label, emb_a, emb_b in embedding_pairs:\n",
"    cos_sim = util.cos_sim(emb_a, emb_b)\n",
"    print(f\"Cosine-Similarity {pair_label}:\", cos_sim)\n",
"\n",
"# To inspect the raw embeddings (disabled). Kept as comments so the cell does\n",
"# not end in a string literal that Jupyter would echo as the cell result.\n",
"# for sentence, embedding in zip(sentences, sentence_embeddings):\n",
"#     print(\"Sentence:\", sentence)\n",
"#     print(\"Embedding:\", embedding)\n",
"#     print(\"\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"numpy.ndarray"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(t4)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"isinstance(model, int)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.7007368206977844"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cos_sim.item()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cosine-Similarity: tensor([[0.6153]])\n"
]
}
],
"source": [
"from sentence_transformers import SentenceTransformer, util\n",
"\n",
"# Standalone English sanity check with a second checkpoint.\n",
"# Uses its own names so that `model` and `cos_sim`, which earlier cells read,\n",
"# are not overwritten when this cell is run.\n",
"minilm_model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"\n",
"# Sentences are encoded by calling encode() on the model\n",
"emb1 = minilm_model.encode(\"This is a red cat with a hat.\")\n",
"emb2 = minilm_model.encode(\"Have you seen my red cat?\")\n",
"\n",
"cos_sim_en = util.cos_sim(emb1, emb2)\n",
"print(\"Cosine-Similarity:\", cos_sim_en)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}