From 32dd89caead0dff1c8f23c3535cd357f814bb9a9 Mon Sep 17 00:00:00 2001
From: Constantin Wenger <constantin.wenger@googlemail.com>
Date: Thu, 13 Jun 2019 11:12:44 +0000
Subject: [PATCH] Parallelize hash generation and image downloads; add set icon fetching

Hash generation now fans out over a multiprocessing pool and card image
downloads run through a thread pool. The new fetch_icons.py script
downloads each set's icon from Scryfall; it expects sets.json (Scryfall's
set list) in the data directory.
---
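Notes: fetch_icons.py expects Scryfall's set list at <data_dir>/sets.json
before it runs. Assuming Config.data_dir points at ./data, something like
the following should stage the file (the /sets endpoint is Scryfall's
public set-list API) and then pull one SVG icon per set code:

    python3 -c "from urllib import request; request.urlretrieve('https://api.scryfall.com/sets', 'data/sets.json')"
    python3 fetch_icons.py
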
 fetch_data.py  | 10 ++++-
 fetch_icons.py | 23 ++++++++++++
 opencv_dnn.py  | 68 +++++++++++++++++++++++-----------
 3 files changed, 77 insertions(+), 24 deletions(-)

diff --git a/fetch_data.py b/fetch_data.py
index 00bf592..500dc55 100644
--- a/fetch_data.py
+++ b/fetch_data.py
@@ -81,9 +81,15 @@
         # df is a single row of card
         fetch_card_image(df, out_dir, size)
     else:
+        from concurrent.futures import ThreadPoolExecutor, wait as fwait
         # df is a dataframe containing list of cards
-        for ind, row in df.iterrows():
-            fetch_card_image(row, out_dir, size)
+        futures = []
+        # download the card images in parallel; the context manager shuts
+        # the pool down once every download has finished
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            for ind, row in df.iterrows():
+                futures.append(executor.submit(fetch_card_image, row, out_dir, size))
+            fwait(futures)
 
 
 def fetch_card_image(row, out_dir=None, size='png'):
diff --git a/fetch_icons.py b/fetch_icons.py
new file mode 100644
index 0000000..0396cee
--- /dev/null
+++ b/fetch_icons.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+import sys
+import json
+import os
+from config import Config
+from urllib import request
+
+
+def main(args):
+    # sets.json is Scryfall's set list; it must already be in the data dir
+    with open(os.path.join(Config.data_dir, 'sets.json'), 'rt') as setfile:
+        setdata = json.load(setfile)
+    icon_dir = os.path.join(Config.data_dir, 'icons')
+    os.makedirs(icon_dir, exist_ok=True)
+    for mset in setdata['data']:
+        if len(mset['code']) > 3:  # not an official set
+            continue
+        request.urlretrieve(mset['icon_svg_uri'],
+                            filename=os.path.join(icon_dir, mset['code'] + '.svg'))
+
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/opencv_dnn.py b/opencv_dnn.py
index 7801bc3..9621267 100644
--- a/opencv_dnn.py
+++ b/opencv_dnn.py
@@ -9,7 +9,7 @@
 import pandas as pd
 from PIL import Image
 import time
-
+from multiprocessing import Pool
 from config import Config
 import fetch_data
 
@@ -22,25 +22,16 @@
 https://github.com/hj3yoo/mtg_card_detector/tree/dea64611730c84a59c711c61f7f80948f82bcd31 
 """
 
-
-def calc_image_hashes(card_pool, save_to=None, hash_size=None):
-    """
-    Calculate perceptual hash (pHash) value for each cards in the database, then store them if needed
-    :param card_pool: pandas dataframe containing all card information
-    :param save_to: path for the pickle file to be saved
-    :param hash_size: param for pHash algorithm
-    :return: pandas dataframe
-    """
-    if hash_size is None:
-        hash_size = [16, 32]
-    elif isinstance(hash_size, int):
-        hash_size = [hash_size]
-    
-    # Since some double-faced cards may result in two different cards, create a new dataframe to store the result
+def do_calc(args):
+    # Pool.map passes a single argument, so the dataframe partition and
+    # the list of hash sizes arrive packed together in one tuple
+    card_pool, hash_size = args
+    # Since some double-faced cards may result in two different cards,
+    # create a new dataframe to store the result
     new_pool = pd.DataFrame(columns=list(card_pool.columns.values))
     for hs in hash_size:
-            new_pool['card_hash_%d' % hs] = np.NaN
-            #new_pool['art_hash_%d' % hs] = np.NaN
+        new_pool['card_hash_%d' % hs] = np.NaN
+        #new_pool['art_hash_%d' % hs] = np.NaN
     for ind, card_info in card_pool.iterrows():
         if ind % 100 == 0:
             print('Calculating hashes: %dth card' % ind)
@@ -82,6 +73,33 @@
                 #art_hash = ih.phash(img_art, hash_size=hs)
                 #card_info['art_hash_%d' % hs] = art_hash
             new_pool.loc[0 if new_pool.empty else new_pool.index.max() + 1] = card_info
+    return new_pool
+
+
+def calc_image_hashes(card_pool, save_to=None, hash_size=None):
+    """
+    Calculate perceptual hash (pHash) values for each card in the database, then store them if needed
+    :param card_pool: pandas dataframe containing all card information
+    :param save_to: path for the pickle file to be saved
+    :param hash_size: param for pHash algorithm
+    :return: pandas dataframe
+    """
+    if hash_size is None:
+        hash_size = [16, 32]
+    elif isinstance(hash_size, int):
+        hash_size = [hash_size]
+
+    # Split the card pool into partitions and hash them in a process pool;
+    # pd.concat stitches the partial results back together
+    num_cores = 15
+    num_partitions = 60
+    pool = Pool(num_cores)
+    df_split = np.array_split(card_pool, num_partitions)
+    new_pool = pd.concat(pool.map(do_calc, [(split, hash_size) for split in df_split]))
+    pool.close()
+    pool.join()
+    # every partition restarts its index at 0, so rebuild a unique index
+    new_pool.reset_index(drop=True, inplace=True)
 
     if save_to is not None:
         new_pool.to_pickle(save_to)
@@ -217,7 +235,7 @@
     img_erode = cv2.erode(img_dilate, kernel, iterations=1)
 
     # Find the contour
-    _, cnts, hier = cv2.findContours(img_erode, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+    cnts, hier = cv2.findContours(img_erode, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
     if len(cnts) == 0:
         #print('no contours')
         return []
@@ -358,7 +376,7 @@
                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
         if debug:
             # cv2.rectangle(img_warp, (22, 47), (294, 249), (0, 255, 0), 2)
-            cv2.putText(img_warp, card_name + ', ' + str(hash_diff), (0, 20),
+            cv2.putText(img_warp, card_name + ':' + card_set + ', ' + str(hash_diff), (0, 20),
                         cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1)
             cv2.imshow('card#%d' % i, img_warp)
     if display:
@@ -467,7 +485,8 @@
 
 def main(args):
     # Specify paths for all necessary files
-
+    hash_sizes = {16, 32}
+    hash_sizes.add(args.hash_size)
     pck_path = os.path.abspath('card_pool.pck')
     if os.path.isfile(pck_path):
         card_pool = pd.read_pickle(pck_path)
@@ -482,8 +501,13 @@
         card_pool = pd.concat(df_list, sort=True)
         card_pool.reset_index(drop=True, inplace=True)
         card_pool.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')
-        calc_image_hashes(card_pool, save_to=pck_path)
+        card_pool = calc_image_hashes(card_pool, save_to=pck_path, hash_size=hash_sizes)
     ch_key = 'card_hash_%d' % args.hash_size
+    if ch_key not in card_pool.columns:
+        # this hash size has not been generated yet; compute and cache it
+        print('Computing hashes for missing hash_size=%d' % args.hash_size)
+        card_pool = calc_image_hashes(card_pool, save_to=pck_path, hash_size=[args.hash_size])
+
     card_pool = card_pool[['name', 'set', 'collector_number', ch_key]]
 
     # Processing time is almost linear to the size of the database

--
Gitblit v1.10.0