Commit 5b60da5c authored by Hachemin Pierre-Yves

Merge branch 'master' into 'PY/InputYears'

# Conflicts:
#   Extract_spectrum.py
parents 8afa5c9a 839450a6
@@ -25,6 +25,7 @@ spectrumDir = '../spectrumImages/spectrumImages2008/' # Output folder to store t
countDownload = 1
countSpectrum = 1
def main():
global videoDir
global spectrumDir
@@ -41,26 +42,26 @@ def main():
follow = False
SPECTRUM.join()
def downloadTrailer(): #called by main()
def downloadTrailer(): # called by main()
"""Function downloading the trailer thanks to the youtube-dl library"""
global countDownload
global exceptDict
ydl_opts = {'format': 'worst/worstvideo',
'outtmpl': videoDir+'%(id)s.%(ext)s',
'outtmpl': videoDir + '%(id)s.%(ext)s',
'noplaylist': True,
'nocheckcertificate':True,
#'max_filesize' : 10000000,
"nocheckcertificate": True,
'save_path' : videoDir }
'nocheckcertificate': True,
# 'max_filesize' : 10000000,
'save_path': videoDir}
for key in linkDict.keys():
if not os.path.isfile(spectrumDir+linkDict[key][2]+'.jpg'):
if not os.path.isfile(spectrumDir + linkDict[key][2] + '.jpg'):
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
try:
ydl.download(['https://www.youtube.com/watch?v=' + linkDict[key][2]])
info_dict = ydl.extract_info('https://www.youtube.com/watch?v=' + linkDict[key][2], download=False)
video_ext = info_dict.get("ext", None)
max_filesize=info_dict.get('filesize',None)
#print("max_filesize : {}".format(max_filesize))
max_filesize = info_dict.get('filesize', None)
# print("max_filesize : {}".format(max_filesize))
queue.put(linkDict[key][2] + '.' + video_ext)
except Exception as e: # catch *all* exceptions
exceptDict[linkDict[key][2]] = str(e)
@@ -70,25 +71,26 @@ def downloadTrailer(): #called by main()
print("{}/{} downloads".format(countDownload, len(linkDict)))
else:
print(spectrumDir+linkDict[key][2]+'.jpg already exists !')
print(spectrumDir + linkDict[key][2] + '.jpg already exists !')
countDownload += 1
def createSpectrum(): #called by main()
def createSpectrum(): # called by main()
"""Function creating the spectrum from the full queue"""
global countSpectrum
global spectrumDir
while (not queue.empty()) or follow:
item = queue.get()
imgSpectrum(videoDir+item, spectrumDir + item[:-4] + '.jpg')
imgSpectrum(videoDir + item, spectrumDir + item[:-4] + '.jpg')
print(item[:-4])
global countDownload
print("{}/{} spectrum".format(countSpectrum, len(linkDict)))
countSpectrum += 1
os.remove(videoDir+item)
os.remove(videoDir + item)
queue.task_done()
def imgSpectrum(vidPath, spectrumOut): #called by createSpectrum()
def imgSpectrum(vidPath, spectrumOut): # called by createSpectrum()
"""Function saving the image thanks to the spectrum array """
start = time()
if vidPath[-3:] != '3gp':
@@ -99,10 +101,11 @@ def imgSpectrum(vidPath, spectrumOut): #called by createSpectrum()
cv2.imwrite(spectrumOut, output)
print()
print("--------------------------------------------------------")
print("Duration for {} : {} s".format(vidPath[:-3],time() - start))
print("Duration for {} : {} s".format(vidPath[:-3], time() - start))
print("--------------------------------------------------------")
def resizeVideo(vidIn, vidOut): #called by imgSpectrum()
def resizeVideo(vidIn, vidOut): # called by imgSpectrum()
"""Function to resize the video in case it is too big (mp4 format)"""
try:
clip = mp.VideoFileClip(vidIn)
@@ -113,12 +116,13 @@ def resizeVideo(vidIn, vidOut): #called by imgSpectrum()
return vidOut
return vidOut
def tableSpectrum(video): #called by imgSpectrum()
def tableSpectrum(video): # called by imgSpectrum()
"""Function creating the array for the spectrum"""
count = 0
ratioFrame = 6
countFrame = 0
vidcrop=cv2.VideoCapture(video)
vidcrop = cv2.VideoCapture(video)
"""Find if the trailer is in 4:3 or 21:9 --> borders to crop"""
success, image = vidcrop.read()
@@ -145,7 +149,7 @@ def tableSpectrum(video): #called by imgSpectrum()
while success:
if countFrame == ratioFrame:
countFrame = 0
res[0][count] = meanImage(image,cropdim)
res[0][count] = meanImage(image, cropdim)
for i in range(1, res.shape[0]):
res[i][count] = res[0][count]
count += 1
@@ -156,39 +160,40 @@ def tableSpectrum(video): #called by imgSpectrum()
return res
def crop_image_dim(img,tol): #called by tableSpectrum()
"""Find the dimensions cropped if this is a 4:3 or 21:9 trailer"""
#tol is the tolerance : 0 to find black pixels
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
mask = img_gray>tol
midV=floor(mask.shape[0]/2)
midH=floor(mask.shape[1]/2)
upBorder=0
downBorder=mask.shape[0]
leftBorder=0
rightBorder=mask.shape[1]
for i in range(1,mask.shape[0]):
if mask[i-1][midH]!=mask[i][midH] and mask[i][midH]==True:
if upBorder==mask[0][midH]:
upBorder=i-1
if mask[i-1][midH]!=mask[i][midH] and mask[i][midH]==False:
downBorder=i
for j in range(1,mask.shape[1]):
if mask[midV][j-1]!=mask[midV][j] and mask[midV][j]==True:
if leftBorder==mask[midV][0]:
leftBorder=j-1
if mask[midV][j-1]!=mask[midV][j] and mask[midV][j]==False:
rightBorder=j
return (upBorder,downBorder, leftBorder, rightBorder)
def meanImage(img,cropdim): #called by tableSpectrum()
def crop_image_dim(img, tol): # called by tableSpectrum()
"""Find the dimensions cropped if this is a 4:3 or 21:9 trailer"""
# tol is the tolerance : 0 to find black pixels
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
mask = img_gray > tol
midV = floor(mask.shape[0] / 2)
midH = floor(mask.shape[1] / 2)
upBorder = 0
downBorder = mask.shape[0]
leftBorder = 0
rightBorder = mask.shape[1]
for i in range(1, mask.shape[0]):
if mask[i - 1][midH] != mask[i][midH] and mask[i][midH] == True:
if upBorder == mask[0][midH]:
upBorder = i - 1
if mask[i - 1][midH] != mask[i][midH] and mask[i][midH] == False:
downBorder = i
for j in range(1, mask.shape[1]):
if mask[midV][j - 1] != mask[midV][j] and mask[midV][j] == True:
if leftBorder == mask[midV][0]:
leftBorder = j - 1
if mask[midV][j - 1] != mask[midV][j] and mask[midV][j] == False:
rightBorder = j
return (upBorder, downBorder, leftBorder, rightBorder)
def meanImage(img, cropdim): # called by tableSpectrum()
"""Return the mean LAB array only with pixels in cropped image"""
res0 = [0, 0, 0]
res = [0, 0, 0]
lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
for i in range(cropdim[0],cropdim[1]):
for j in range(cropdim[2],cropdim[3]):
for i in range(cropdim[0], cropdim[1]):
for j in range(cropdim[2], cropdim[3]):
res0 = res0 + lab[i][j]
sumpix=(cropdim[1]-cropdim[0]+1)*(cropdim[3]-cropdim[2]+1)
res0[0]= res0[0] / sumpix
@@ -200,12 +205,13 @@ def meanImage(img,cropdim): #called by tableSpectrum()
return res
def reading(path): #Called by main()
def reading(path): # Called by main()
"""Function to create the dictionnary from the text file"""
with open(path, 'r') as f:
s = f.read()
whip = literal_eval(s)
return whip
if __name__ == '__main__':
main()
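Note on the Link-dictionary files that reading() parses: the function simply literal_eval's the text file, and both this script (linkDict[key][2]) and the notebook below (dictyear[movie_id][1], dictyear[movie_id][2]) index each value as a list whose element 1 holds the genre ids and whose element 2 holds the YouTube video id; element 0 is not used in these snippets. A minimal sketch of such an entry, with a made-up movie id, placeholder first element and video id (only the indexing comes from the code above):

from ast import literal_eval

# Hypothetical Link-dictionary content; the movie id, placeholder string and
# video id are illustrative only, not taken from the project's data.
sample = "{10184: ['<unused here>', [28, 53], 'dQw4w9WgXcQ']}"
linkDict = literal_eval(sample)
print(linkDict[10184][1])  # genre-id list, e.g. [28, 53]
print(linkDict[10184][2])  # YouTube id appended to 'https://www.youtube.com/watch?v='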
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. CNN"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Récupération des genres"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Permet de récupérer les labels qui seront mis dans une array"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import pandas as pd\n",
"from __future__ import absolute_import\n",
"from __future__ import division\n",
"from __future__ import print_function\n",
"from keras.preprocessing.sequence import pad_sequences\n",
"pd.options.mode.chained_assignment = None\n",
"import argparse\n",
"import sys\n",
"import numpy as np\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.image as mpimg\n",
"import tensorflow as tf\n",
"import os\n",
"import cv2\n",
"from math import floor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Choisir l'année !"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"years = [2004, 2005, 2006, 2007, 2008]"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"list_of_eligible_spectrums = []\n",
"for year in years:\n",
" for file in os.listdir(\"spectrumImages/SpectrumImages\" + str(year)):\n",
" if str(file)[-4:] == '.jpg':\n",
" list_of_eligible_spectrums += ['SpectrumImages'+ str(year) +'/' + file]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We select: Action / Comedy / Thriller / Horror / Drama\n",
" 28 / 35 / 53/ 27 / 18"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"{\"genres\":[{\"id\":28,\"name\":\"Action\"},{\"id\":12,\"name\":\"Adventure\"},{\"id\":16,\"name\":\"Animation\"},{\"id\":35,\"name\":\"Comedy\"},{\"id\":80,\"name\":\"Crime\"},{\"id\":99,\"name\":\"Documentary\"},{\"id\":18,\"name\":\"Drama\"},{\"id\":10751,\"name\":\"Family\"},{\"id\":14,\"name\":\"Fantasy\"},{\"id\":36,\"name\":\"History\"},{\"id\":27,\"name\":\"Horror\"},{\"id\":10402,\"name\":\"Music\"},{\"id\":9648,\"name\":\"Mystery\"},{\"id\":10749,\"name\":\"Romance\"},{\"id\":878,\"name\":\"Science Fiction\"},{\"id\":10770,\"name\":\"TV Movie\"},{\"id\":53,\"name\":\"Thriller\"},{\"id\":10752,\"name\":\"War\"},{\"id\":37,\"name\":\"Western\"}]}"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n",
"do something! Too many points.......\n"
]
}
],
"source": [
"genres = [28, 35, 18, 99, 10749, 10752, 10402, 53, 878, 27, 9648, 80, 14, 12, 36, 10769, 16, 10751, 37, 10770]\n",
"\n",
"wanted_genres = {28: 'action', 35: 'comedy', 18: 'drama', 99: 0, 10749: 'comedy',10752: 'action', 10402: 0, 53: 'thriller', 878: 'action', 27: 'horror', 9648: 'thriller', 80: 'thriller', 14: 'action', 12: 'action', 36: 0, 10769: 0, 16: 0, 10751: 0, 37: 'action', 10770: 0}\n",
"\n",
"def get_genre_from_link():\n",
" dict_inverse = {}\n",
" links_to_be_removed = []\n",
" for year in years:\n",
" path = \"./Link-dictionaries/Link-dictionary\" + str(year)+ \".txt\"\n",
" file = open(path, \"r\").read()\n",
" dictyear = ast.literal_eval(file)\n",
" for movie_id in dictyear.keys():\n",
" if dictyear[movie_id][1] != []:\n",
" dict_inverse[str(dictyear[movie_id][2])] = {}\n",
" movie_genres = dictyear[movie_id][1]\n",
" if wanted_genres[movie_genres[0]] != 0:\n",
" dict_inverse[str(dictyear[movie_id][2])]['genre'] = wanted_genres[movie_genres[0]]\n",
" elif len(movie_genres)>1:\n",
" if wanted_genres[movie_genres[1]] != 0:\n",
" dict_inverse[str(dictyear[movie_id][2])]['genre'] = wanted_genres[movie_genres[1]]\n",
" elif len(movie_genres)>2:\n",
" if wanted_genres[movie_genres[2]] != 0:\n",
" dict_inverse[str(dictyear[movie_id][2])]['genre'] = wanted_genres[movie_genres[2]]\n",
" else:\n",
" links_to_be_removed += [dictyear[movie_id][2]]\n",
" else:\n",
" links_to_be_removed += [dictyear[movie_id][2]]\n",
" else:\n",
" links_to_be_removed += [dictyear[movie_id][2]]\n",
" else:\n",
" links_to_be_removed += [dictyear[movie_id][2]]\n",
" return dict_inverse, links_to_be_removed\n",
"\n",
"def get_output_list(L):\n",
" dict_inverse, links_to_be_removed = get_genre_from_link()\n",
" eligible_links = []\n",
" output = []\n",
" for link in L:\n",
" link = str(link)\n",
" #print(dict_inverse[str(link)])\n",
" if link[-5] == \".\":\n",
" link = link[:-4] + link[-3:]\n",
" print(\"do something! Too many points.......\")\n",
" if link.split('/')[1][:-4] not in links_to_be_removed:\n",
" eligible_links += [link[:-4]]\n",
" return dict_inverse, eligible_links\n",
"\n",
"\n",
"dict_inverse, eligible_links = get_output_list(list_of_eligible_spectrums)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#for element in labels:\n",
"# for genre in element:\n",
"# if genre not in genres:\n",
"# genres += [genre]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#Modif pour ajouter des catégories\n",
"trY[2][3]=1\n",
"trY.shape\n",
"from random import randint\n",
"for i in range(1225):\n",
" rand = randint(0,19)\n",
" trY[i][rand] = 1\n",
"trY[3]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Bien vérifier la taille des données !"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extraction des images"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n",
"'NoneType' object is not subscriptable\n"
]
}
],
"source": [
"for file in eligible_links:\n",
" img = cv2.imread('SpectrumImages/'+ file + '.jpg', 1)\n",
" try:\n",
" img = img[0:1]\n",
" except Exception as e:\n",
" print(e)\n",
" img = cv2.imread('SpectrumImages/'+ file + '..jpg', 1)\n",
" img = img[0:1]\n",
" img = img.reshape((img.shape[1], img.shape[2]))\n",
" dict_inverse[file.split('/')[1]]['image'] = img"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df = pd.DataFrame.from_dict(dict_inverse)\n",
"df = df.transpose()\n",
"df = df.reset_index(drop=True)\n",
"#shuffling\n",
"df = df.sample(frac=1)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>genre</th>\n",
" <th>image</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2812</td>\n",
" <td>1960</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>5</td>\n",
" <td>1960</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>drama</td>\n",
" <td>[[9, 0, 3], [9, 1, 1], [10, 4, 0], [20, 17, 3]...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>891</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" genre image\n",
"count 2812 1960\n",
"unique 5 1960\n",
"top drama [[9, 0, 3], [9, 1, 1], [10, 4, 0], [20, 17, 3]...\n",
"freq 891 1"
]
},
"execution_count": 115,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6981, 2)"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df2 = df.dropna(axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3515, 2)"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.shape"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>image</th>\n",
" </tr>\n",
" <tr>\n",
" <th>genre</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>action</th>\n",
" <td>864</td>\n",
" </tr>\n",
" <tr>\n",
" <th>comedy</th>\n",
" <td>955</td>\n",
" </tr>\n",
" <tr>\n",
" <th>drama</th>\n",
" <td>1079</td>\n",
" </tr>\n",
" <tr>\n",
" <th>horror</th>\n",
" <td>312</td>\n",
" </tr>\n",
" <tr>\n",
" <th>thriller</th>\n",
" <td>305</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" image\n",
"genre \n",
"action 864\n",
"comedy 955\n",
"drama 1079\n",
"horror 312\n",
"thriller 305"
]
},
"execution_count": 144,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.groupby('genre').count()"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
"df3 = pd.get_dummies(df2,columns=['genre'])"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2812"
]
},
"execution_count": 146,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_len = int(df3.shape[0]*0.8)\n",
"train_len"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train = df3.iloc[:train_len, :]\n",
"test = df3.iloc[train_len:, :]"
]
},
{
"cell_type": "code",
"execution_count": 148,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2475 [[15, 11, 6], [17, 14, 0], [17, 15, 0], [14, 1...\n",
"5792 [[25, 17, 10], [20, 18, 7], [22, 19, 4], [27, ...\n",
"3475 [[95, 38, 0], [95, 38, 0], [95, 38, 0], [95, 3...\n",
"4959 [[19, 0, 7], [0, 12, 0], [48, 93, 30], [41, 99...\n",
"5825 [[28, 116, 0], [32, 114, 1], [34, 114, 1], [30...\n",
"Name: image, dtype: object"
]
},
"execution_count": 148,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train['image'].head()"
]
},
{
"cell_type": "code",
"execution_count": 149,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((2812, 6), (703, 6))"
]
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.shape, test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},