generative-ai-for-beginners/19-slm/python/phi35-vision-demo.ipynb
2024-09-12 20:45:37 +08:00

621 lines
20 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"source": [
"# pip install opencv-python"
],
"outputs": [],
"execution_count": 1,
"metadata": {
"gather": {
"logged": 1724068852431
}
}
},
{
"cell_type": "code",
"source": [
"# import cv2\n",
"# import numpy as np"
],
"outputs": [],
"execution_count": 2,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068852519
}
}
},
{
"cell_type": "code",
"source": [
"# def save_keyframes(video_path, output_folder):\n",
"# videoCapture = cv2.VideoCapture(video_path)\n",
"# success, frame = videoCapture.read()\n",
"# i = 0\n",
"# while success:\n",
"# gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)\n",
" \n",
"# hist = cv2.calcHist([gray_frame], [0], None, [256], [0, 256])\n",
" \n",
"# success, next_frame = videoCapture.read()\n",
"# if not success:\n",
"# break\n",
" \n",
"# next_gray_frame = cv2.cvtColor(next_frame, cv2.COLOR_BGR2GRAY)\n",
" \n",
"# next_hist = cv2.calcHist([next_gray_frame], [0], None, [256], [0, 256])\n",
" \n",
"# similarity = cv2.compareHist(hist, next_hist, cv2.HISTCMP_CORREL)\n",
" \n",
"# if similarity < 0.9:\n",
"# i += 1\n",
"# cv2.imwrite(f\"{output_folder}/keyframe_{i}.jpg\", frame)\n",
"# print(f\"Saved keyframe {i}\")\n",
" \n",
"# frame = next_frame\n",
"\n",
"# videoCapture.release()"
],
"outputs": [],
"execution_count": 3,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068852581
}
}
},
{
"cell_type": "code",
"source": [
"# save_keyframes('../video/copilot.mp4', '../output')"
],
"outputs": [],
"execution_count": 4,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068852723
}
}
},
{
"cell_type": "code",
"source": [
"from PIL import Image\n",
"import requests, base64"
],
"outputs": [],
"execution_count": 5,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068852785
}
}
},
{
"cell_type": "code",
"source": [
"images = [] \n",
"placeholder = \"\" \n",
"for i in range(1,22): \n",
" with open(\"../output/keyframe_\"+str(i)+\".jpg\", \"rb\") as f:\n",
"\n",
" images.append(Image.open(\"../output/keyframe_\"+str(i)+\".jpg\"))\n",
" placeholder += f\"<|image_{i}|>\\n\"\n",
" # print(i)"
],
"outputs": [],
"execution_count": 18,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069071375
}
}
},
{
"cell_type": "code",
"source": [
"images"
],
"outputs": [
{
"output_type": "execute_result",
"execution_count": 19,
"data": {
"text/plain": "[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>,\n <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x720>]"
},
"metadata": {}
}
],
"execution_count": 19,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069078111
}
}
},
{
"cell_type": "code",
"source": [
"from transformers import AutoModelForCausalLM \n",
"from transformers import AutoProcessor"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "2024-08-19 12:00:55.508879: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n2024-08-19 12:00:55.508910: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
}
],
"execution_count": 8,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068856588
}
}
},
{
"cell_type": "code",
"source": [
"model_id = \"../Phi3Vision\""
],
"outputs": [],
"execution_count": 9,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068856676
}
}
},
{
"cell_type": "code",
"source": [
"model = AutoModelForCausalLM.from_pretrained(model_id, device_map=\"cuda\", trust_remote_code=True, torch_dtype=\"auto\", _attn_implementation='flash_attention_2')"
],
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "4e678ae3115b4d89bff5caf06c97933d"
}
},
"metadata": {}
}
],
"execution_count": 10,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068938629
}
}
},
{
"cell_type": "code",
"source": [
"messages = [\n",
" {\"role\": \"user\", \"content\": placeholder+\"Summarize the video.\"}, \n",
"]"
],
"outputs": [],
"execution_count": 11,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068938692
}
}
},
{
"cell_type": "code",
"source": [
"pip install transformers -U"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Requirement already satisfied: transformers in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (4.44.0)\nRequirement already satisfied: safetensors>=0.4.1 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (0.4.3)\nRequirement already satisfied: requests in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (2.31.0)\nRequirement already satisfied: pyyaml>=5.1 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (6.0.1)\nRequirement already satisfied: huggingface-hub<1.0,>=0.23.2 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (0.24.5)\nRequirement already satisfied: filelock in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (3.14.0)\nRequirement already satisfied: tqdm>=4.27 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (4.66.2)\nRequirement already satisfied: regex!=2019.12.17 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (2024.4.28)\nRequirement already satisfied: packaging>=20.0 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (24.0)\nRequirement already satisfied: numpy>=1.17 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (1.23.5)\nRequirement already satisfied: tokenizers<0.20,>=0.19 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from transformers) (0.19.1)\nRequirement already satisfied: typing-extensions>=3.7.4.3 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (4.12.2)\nRequirement already satisfied: fsspec>=2023.5.0 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from huggingface-hub<1.0,>=0.23.2->transformers) (2023.10.0)\nRequirement already satisfied: certifi>=2017.4.17 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from requests->transformers) (2022.9.24)\nRequirement already satisfied: charset-normalizer<4,>=2 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from requests->transformers) (3.3.2)\nRequirement already satisfied: idna<4,>=2.5 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from requests->transformers) (3.7)\nRequirement already satisfied: urllib3<3,>=1.21.1 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from requests->transformers) (1.26.18)\nNote: you may need to restart the kernel to use updated packages.\n"
}
],
"execution_count": 12,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068942055
}
}
},
{
"cell_type": "code",
"source": [
"from transformers import AutoModelForCausalLM \n",
"from transformers import AutoProcessor\n",
"\n",
"\n",
"# from image_embedding_phi3_v import Phi3VImageProcessor \n",
"\n",
"# transformers.Phi3VImageProcessor = Phi3VImageProcessor "
],
"outputs": [],
"execution_count": 13,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068942117
}
}
},
{
"cell_type": "code",
"source": [
"processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=4)"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "/anaconda/envs/azureml_py38/lib/python3.9/site-packages/transformers/models/auto/image_processing_auto.py:513: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead\n warnings.warn(\n"
}
],
"execution_count": 14,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068942746
}
}
},
{
"cell_type": "code",
"source": [
"pip install jinja2 -U"
],
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": "Requirement already satisfied: jinja2 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (3.1.4)\nRequirement already satisfied: MarkupSafe>=2.0 in /anaconda/envs/azureml_py38/lib/python3.9/site-packages (from jinja2) (2.0.1)\nNote: you may need to restart the kernel to use updated packages.\n"
}
],
"execution_count": 15,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068946299
}
}
},
{
"cell_type": "code",
"source": [
"prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)"
],
"outputs": [],
"execution_count": 16,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724068946385
}
}
},
{
"cell_type": "code",
"source": [
"inputs = processor(prompt, images, return_tensors=\"pt\").to(\"cuda:0\")"
],
"outputs": [],
"execution_count": 20,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069088259
}
}
},
{
"cell_type": "code",
"source": [
"generation_args = { \"max_new_tokens\": 1000, \"temperature\": 0.0, \"do_sample\": False, }"
],
"outputs": [],
"execution_count": 21,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069108248
}
}
},
{
"cell_type": "code",
"source": [
"generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)"
],
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": "/anaconda/envs/azureml_py38/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:567: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.0` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n warnings.warn(\nThe `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n"
}
],
"execution_count": 22,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069135074
}
}
},
{
"cell_type": "code",
"source": [
"generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]"
],
"outputs": [],
"execution_count": 23,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069138113
}
}
},
{
"cell_type": "code",
"source": [
"response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
],
"outputs": [],
"execution_count": 24,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069148861
}
}
},
{
"cell_type": "code",
"source": [
"response"
],
"outputs": [
{
"output_type": "execute_result",
"execution_count": 25,
"data": {
"text/plain": "\"The video appears to be a promotional or informational piece about a product or service called 'Copilot'. It showcases various individuals in different settings, such as an office and a home office, engaging with the product. The video includes text overlays that suggest the product can help with tasks like lowering production costs, managing meetings, and creating presentations. There are also screenshots of a chat interface and a summary of a product launch discussion, indicating that the product may be related to project management or collaboration. The video seems to emphasize the efficiency and effectiveness of the Copilot product in a professional context.\""
},
"metadata": {}
}
],
"execution_count": 25,
"metadata": {
"jupyter": {
"source_hidden": false,
"outputs_hidden": false
},
"nteract": {
"transient": {
"deleting": false
}
},
"gather": {
"logged": 1724069154901
}
}
}
],
"metadata": {
"kernelspec": {
"name": "python38-azureml",
"language": "python",
"display_name": "Python 3.8 - AzureML"
},
"language_info": {
"name": "python",
"version": "3.9.19",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"microsoft": {
"ms_spell_check": {
"ms_spell_check_language": "en"
},
"host": {
"AzureML": {
"notebookHasBeenCompleted": true
}
}
},
"kernel_info": {
"name": "python38-azureml"
},
"nteract": {
"version": "nteract-front-end@1.0.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}