Commit e19bb48a authored by tuhe

Various updates for 02465 during semester

parent a6f84c4c
@@ -11,7 +11,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
# beamer-slider
setuptools.setup(
name="coursebox",
version="0.1.2",
version="0.1.4",
author="Tue Herlau",
author_email="tuhe@dtu.dk",
description="A course management system currently used at DTU",
@@ -30,5 +30,5 @@ setuptools.setup(
package_dir={"": "src"},
packages=setuptools.find_packages(where="src"),
python_requires=">=3.8",
install_requires=['numpy','pycode_similar','tika','openpyxl', 'xlwings','matplotlib','langdetect','jinjafy','beamer-slider','tinydb'],
install_requires=['numpy','pycode_similar','tika','openpyxl', 'xlwings','matplotlib','langdetect','beamer-slider','tinydb'],
)
Metadata-Version: 2.1
Name: coursebox
Version: 0.1.1
Version: 0.1.4
Summary: A course management system currently used at DTU
Home-page: https://lab.compute.dtu.dk/tuhe/coursebox
Author: Tue Herlau
@@ -16,6 +16,34 @@ Description-Content-Type: text/markdown
License-File: LICENSE
# Coursebox DTU
DTU course management software.
## Installation
```terminal
pip install coursebox
```
## What it can do
- Single semester-dependent configuration file
- Integrates with DTU Inside/DTU Learn
- Distribution/evaluation of project reports in a Learn-compatible format
- Quiz generation in a DTU Learn/Beamer-friendly format
- Automatic website/syllabus generation
- Automatic generation of lecture handouts/exercises (you don't have to track dynamic content like dates or lecture titles; it is all in the configuration)
- Easy compilation to 2/5-day formats (continuing education)
## Usage
Coursebox requires a specific directory structure. It is easiest to start from an existing course and adapt it to your needs. Please contact me at tuhe@dtu.dk for more information.
## Citing
```bibtex
@online{coursebox,
title={Coursebox (0.1.1): \texttt{pip install coursebox}},
url={https://lab.compute.dtu.dk/tuhe/coursebox},
urldate = {2021-09-07},
month={9},
publisher={Technical University of Denmark (DTU)},
author={Tue Herlau},
year={2021},
}
```
openpyxl
numpy
pycode_similar
tika
openpyxl
xlwings
pybtex
langdetect
wexpect
pexpect
matplotlib
numpy
pycode_similar
jinjafy
langdetect
beamer-slider
tinydb
@@ -12,10 +12,14 @@ from coursebox.core.info_paths import core_conf
# import pybtex.database.input.bibtex
# import pybtex.plugin
# import io
from line_profiler_pycharm import profile
import time
@profile
def xlsx_to_dicts(xlsx_file,sheet=None, as_dict_list=False):
wb = openpyxl.load_workbook(xlsx_file, data_only=True)
# print("Loading...", xlsx_file, sheet, as_dict_list)
t0 = time.time()
wb = openpyxl.load_workbook(xlsx_file, data_only=True, read_only=True)
if not sheet:
ws = wb.worksheets[0]
else:
@@ -24,26 +28,65 @@ def xlsx_to_dicts(xlsx_file,sheet=None, as_dict_list=False):
return None
else:
ws = ws.pop()
dd = []
key_cols = [j for j in range(ws.max_column) if ws.cell(row=1, column=j + 1).value is not None]
for i in range(1, ws.max_row):
rdict = {}
if not any( [ws.cell(row=i+1, column=j+1).value is not None for j in key_cols] ):
continue
for j in key_cols:
key = ws.cell(row=1, column=j+1).value
if key is not None:
key = key.strip() if isinstance(key,str) else key
value = ws.cell(row=i + 1, column=j + 1).value
value = value.strip() if isinstance(value,str) else value
if isinstance(value, str):
if value == 'True':
value = True
if value == 'False':
value = False
rdict[key] = value
dd.append(rdict)
# print(time.time()-t0)
# dd = []
# key_cols = [j for j in range(ws.max_column) if ws.cell(row=1, column=j + 1).value is not None]
# print(time.time()-t0, ws.max_row)
# np.array([[i.value for i in j[1:5]] for j in ws.rows])
import numpy as np
A = np.array([[i.value for i in j] for j in ws.rows])
# print(time.time() - t0, ws.max_row, len(key_cols))
# for j in range(A.shape[1]):
a = 234
# for i in range(1, ws.max_row):
# rdict = {}
# if not any( [ws.cell(row=i+1, column=j+1).value is not None for j in key_cols] ):
# continue
# for j in key_cols:
# key = ws.cell(row=1, column=j+1).value
# if key is not None:
# key = key.strip() if isinstance(key,str) else key
# value = ws.cell(row=i + 1, column=j + 1).value
# value = value.strip() if isinstance(value,str) else value
# if isinstance(value, str):
# if value == 'True':
# value = True
# if value == 'False':
# value = False
# rdict[key] = value
# dd.append(rdict)
# print(time.time()-t0)
A = A[:, A[0] != None]
A = A[(A != None).sum(axis=1) > 0, :]
dd2 = []
for i in range(1, A.shape[0]):
A[A == 'True'] = True
A[A == 'False'] = False
d = dict(zip(A[0, :].tolist(), [a.strip() if isinstance(a,str) else a for a in A[i, :].tolist() ]))
dd2.append(d)
# print(time.time() - t0)
dd = dd2
# if dd != dd2:
# for k in range(len(dd)):
# if dd[k] != dd2[k]:
# print(k)
# print(dd)
# print(dd2)
# assert False
# print("BAd!")
if as_dict_list:
dl = list_dict2dict_list(dd)
for k in dl.keys():
@@ -51,6 +94,8 @@ def xlsx_to_dicts(xlsx_file,sheet=None, as_dict_list=False):
if len(x) == 1: x = x.pop()
dl[k] = x
dd = dl
wb.close()
# print("xlsx2dicts", time.time()-t0)
return dd
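For reference, the new fast path boils down to one streaming pass over `ws.rows` followed by numpy masks for header-less columns and blank rows. A condensed, self-contained sketch of that pattern (a sketch only: it assumes a rectangular sheet, and `sheet_to_dicts` is an illustrative name, not part of coursebox):
```python
import numpy as np
import openpyxl

def sheet_to_dicts(xlsx_file):
    # read_only=True streams cells instead of building the full in-memory DOM
    wb = openpyxl.load_workbook(xlsx_file, data_only=True, read_only=True)
    ws = wb.worksheets[0]
    rows = [[c.value for c in row] for row in ws.rows]
    wb.close()
    A = np.array(rows, dtype=object)
    A = A[:, A[0] != None]                  # keep only columns with a header
    A = A[(A != None).sum(axis=1) > 0, :]   # drop rows that are entirely blank
    keys = [k.strip() if isinstance(k, str) else k for k in A[0]]
    return [dict(zip(keys, r.tolist())) for r in A[1:]]
```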
def get_enrolled_students():
@@ -200,6 +245,7 @@ def get_forum(paths):
d2.append({k: v[i] for k, v in dd.items()})
return d2
@profile
def class_information():
course_number = core_conf['course_number']
piazza = 'https://piazza.com/dtu.dk/%s%s/%s' % (semester().lower(), year(), course_number)
@@ -214,8 +260,8 @@ def class_information():
'piazza': piazza, # deprecated.
'course_number': course_number,
'semester': semester(),
'reports_handout': [1,6],
'reports_handin': [6,11],
# 'reports_handout': [1,6], # Set in excel conf.
# 'reports_handin': [6, 11], # set in excel conf.
'semester_id': semester_id(),
'today': today(),
'instructors': get_instructors(),
import re
import tempfile
import tika
import os
import shutil
import openpyxl
@@ -5,7 +8,6 @@ import numpy as np
import itertools
import math
import glob
# import zipfile
from tika import parser
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.utils import get_column_letter
@@ -22,6 +24,8 @@ from jinjafy.plot.plot_helpers import get_colors
import time
from collections import defaultdict
import zipfile
import hashlib
import pandas as pd
def get_dirs(zf):
@@ -32,13 +36,11 @@ def get_dirs(zf):
def fix_handins_fuckup(project_id=2):
""" Handle the problem with multiple hand-ins in DTU learn. """
import zipfile
paths = get_paths()
from coursebox.core.info import class_information
info = class_information()
zf = paths['instructor_project_evaluations'] + f"/zip{project_id}.zip"
tas = [i['shortname'] for i in info['instructors'] ]
ta_links = {i['shortname']: i for i in info['instructors']}
@@ -51,7 +53,6 @@ def fix_handins_fuckup(project_id=2):
ta_reports[r] = ta
fls = get_dirs(zf)
# fls = [f for f in zip.namelist() if not f.endswith("tml") and f.endswith("/")]
d = defaultdict(lambda: [])
for l in fls:
@@ -123,7 +124,6 @@ def handle_projects(verbose=False, gather_main_xlsx_file=True, plagiarism_check=
zip1 = instructor_path + "/zip1.zip"
zip2 = instructor_path + "/zip2.zip"
zip3 = instructor_path + "/zip3.zip"
zips = [None, zip1, zip2, zip3]
for j,zf in enumerate(zips):
@@ -138,12 +138,12 @@ def handle_projects(verbose=False, gather_main_xlsx_file=True, plagiarism_check=
continue
else: # instructor files do not exist
if j == 0:
copy_populate_from_template(info, sheet_number=j, zip_file=None)
copy_populate_from_template(paths, info, sheet_number=j, zip_file=None)
elif os.path.exists(zf):
# make a copy of report template and populate it with groups obtained from previous report evaluation.
# all_groups = get_all_reports_from_collected_xlsx_file()
copy_populate_from_template(info, sheet_number=j, zip_file=zf)
copy_populate_from_template(paths, info, sheet_number=j, zip_file=zf)
# distribute_zip_content(info, sheet=j, zf_base=zf)
else:
print("When available, please move downloaded copy of all reports from campusnet to destination:")
@@ -228,13 +228,13 @@ def compute_error_files(info, paths):
es = err_label + f"> Report score is {g.get('score', 0)}. The report score has to be between 0 and 4; probably due to a too high value of 'Delta' in instructor sheet."
ERRORS[ins].append(es)
if repn >= 1 and not g['comments']:
if repn >= 1 and not g['comments'] and info['course_number'] != '02465':
es = err_label + "> Incomplete report evaluation (missing comments field)"
es += "Please fill out comments field in your excel sheet."
ERRORS[ins].append(es)
if repn >= 1 and not g['approver_comments']:
if repn >= 1 and not g['approver_comments'] and info['course_number'] != '02465':
es = err_label + "> Incomplete report evaluation (you are missing the approver comments field; can simply be set to 'ok')."
ERRORS.get(g['approver'], []).append(es)
@@ -300,10 +300,70 @@ def get_instructor_xlsx_files(info, sheet):
return xlsx
import hashlib
def get_groups_from_learn_xslx_file(paths, sheet_number):
fname = f"{paths['instructor_project_evaluations']}/groups{sheet_number}.xlsx"
all_groups = []
if os.path.exists(fname):
# Reading from the groups{number}.xlsx group-id file exported from DTU learn. Note this file contains fuckups.
dg = defaultdict(list)
df = pd.read_excel(fname)
for uname, group_id in zip(df['Username'], df['Project groups']):
id = int(group_id.split(" ")[1])
if len(uname) == 7 and uname[0] == 's':
dg[id].append(uname)
else:
dg[id].append("DTU-LEARN-FUCKED-THIS-ID-UP-CHECK-ON-REPORT")
def copy_populate_from_template(info, sheet_number,zip_file):
all_groups = [{'group_id': id, 'student_ids': students} for id, students in dg.items()]
return all_groups
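The grouping step above assumes the exported "Project groups" column holds strings like "Group 12" and that usernames are seven-character s-numbers; a condensed illustration with invented sample rows:
```python
from collections import defaultdict

# invented sample of (Username, Project groups) pairs from a Learn export
rows = [("s123456", "Group 1"), ("s234567", "Group 1"), ("s345678", "Group 2")]
dg = defaultdict(list)
for uname, group in rows:
    dg[int(group.split(" ")[1])].append(uname)
groups = [{"group_id": gid, "student_ids": studs} for gid, studs in dg.items()]
# -> [{'group_id': 1, 'student_ids': ['s123456', 's234567']}, ...]
```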
def search_projects(paths, sheet_number, patterns):
zip_files = [paths['instructor_project_evaluations'] + "/zip%d.zip" % sheet_number]
# print(zip_files)
all_groups = []
gps = defaultdict(list)
for zip_file in zip_files:
if os.path.exists(zip_file):
tmpdir = tempfile.TemporaryDirectory()
zipfile.ZipFile(zip_file).extractall(path=tmpdir.name)
pdfs = glob.glob(tmpdir.name + "/**/*.pdf", recursive=True)
for pdf in pdfs:
pdf_parsed = tika.parser.from_file(pdf)
id =int(os.path.dirname(pdf).split(" - ")[1].split(" ")[1])
students = re.findall('s\d\d\d\d\d\d', pdf_parsed['content'], flags=re.IGNORECASE)
gps[id] += students
for id, students in gps.items():
all_groups.append({'group_id': id, 'student_ids': list(set(students))})
return all_groups
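The id harvesting above reduces to a case-insensitive regex over the text tika extracts from each PDF; a tiny illustration (the sample string is invented):
```python
import re

text = "Group 7 hand-in by s123456, S234567"  # stand-in for pdf_parsed['content']
ids = set(re.findall(r"s\d{6}", text, flags=re.IGNORECASE))
# -> {'s123456', 'S234567'}
```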
def unpack_zip_file_recursively(zip_file, destination_dir):
"""
Unpack the zip_file (extension: .zip) to the given directory.
If the folders in the zip file contain other zip files, these are unpacked recursively.
"""
# Unpack zip file recursively and flatten it.
zipfile.ZipFile(zip_file).extractall(path=destination_dir)
ls = glob.glob(destination_dir + "/*")
for f in ls:
if os.path.isdir(f):
zipfiles = glob.glob(f + "/*.zip")
for zp in zipfiles:
print("Unpacking student zip file>", zp)
zipfile.ZipFile(zp).extractall(path=os.path.dirname(zp) + "/")
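A hypothetical invocation, extracting into a temporary directory (the zip name is illustrative):
```python
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    unpack_zip_file_recursively("zip2.zip", destination_dir=tmp)
    # tmp now holds the hand-in folders, with nested student zips expanded
```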
def copy_populate_from_template(paths, info, sheet_number,zip_file):
# Try to load group ids from the project pdf's
all_groups = search_projects(paths, sheet_number, r"s\d{6}")
# all_groups = get_groups_from_learn_xslx_file(paths, sheet_number)
if len(all_groups) == 0:
all_groups = projects_info.get_groups_from_report(repn=sheet_number-1) if sheet_number > 0 else []
# Hopefully this did the trick and we have the groups all grouped up.
# set up which TA approve which TA
if any( [i['language'] not in ["en", "any"] for i in info['instructors'] ]):
print(info['instructors'])
@@ -337,10 +397,13 @@ def copy_populate_from_template(info, sheet_number,zip_file):
n_groups_per_instructor = 24 + (sheet_number == 0) * 26
if sheet_number > 0:
zfd = zip_file[:-4]
if not os.path.exists(zfd):
os.mkdir(zfd)
zipfile.ZipFile(zip_file).extractall(path=zfd)
# zfd = zip_file[:-4]
# if not os.path.exists(zfd):
# os.mkdir(zfd)
zfd = tempfile.TemporaryDirectory().name
# zipfile.ZipFile(zip_file).extractall(path=tmpdir.name)
unpack_zip_file_recursively(zip_file, destination_dir=zfd)
# get all report handins (i.e. directories)
ls = [l for l in glob.glob(zfd + "/*") if l[-3:] not in ["txt", "tml"]]
@@ -431,8 +494,8 @@ def copy_populate_from_template(info, sheet_number,zip_file):
corrector = all_tas[shortname]['approver']
if sheet_number > 0:
# Copy reports to directory (distribute amongst TAs)
b_dir = os.path.dirname(zip_file)
ins_dir = "%s/project_%i_%s/"%(b_dir, sheet_number, shortname)
# b_dir = os.path.dirname(zip_file)
ins_dir = "%s/project_%i_%s/"%(zfd, sheet_number, shortname)
if not os.path.exists(ins_dir):
os.mkdir(ins_dir)
@@ -440,7 +503,7 @@ def copy_populate_from_template(info, sheet_number,zip_file):
for handin in all_tas[shortname]['handins']:
shutil.move(handin['path'], ins_dir)
shutil.make_archive(ins_dir[:-1], 'zip', ins_dir)
shutil.make_archive(os.path.dirname(zip_file) +"/"+ os.path.basename(ins_dir[:-1]), 'zip', ins_dir)
time.sleep(2)
print("Removing tree of reports to clear up space...")
shutil.rmtree(ins_dir)
@@ -471,10 +534,10 @@ def copy_populate_from_template(info, sheet_number,zip_file):
sheet.cell(STUDENT_ID_ROW+j, ccol+i).value = s
wb.save(ifile)
wb.close()
# clean up zip file directories
if sheet_number > 0:
zfd = zip_file[:-4]
shutil.rmtree(zfd)
# clean up zip file directories; since it is a tmp file, we don't have to.
# if sheet_number > 0:
# zfd = zip_file[:-4]
# shutil.rmtree(zfd)
def write_dropdown_sumprod_sheet(sheet):
ccol = 2
@@ -3,6 +3,7 @@ import os
import re
import openpyxl
import numpy as np
from line_profiler_pycharm import profile
INSTRUCTOR_ROW = 6
INSTRUCTOR_CHECKER_ROW = 31
@@ -16,19 +17,6 @@ RANGE_MIN_COL = 5
DELTA_ALLOWED_ROW = 111 # The range of possible delta-values. Should be in an empty (new) row at bottom.
def get_all_reports_from_collected_xlsx_file_DEFUNCT(): # when is this used?
out = get_output_file()
wb = openpyxl.load_workbook(out)
all_reports = {}
for repn in range(3, -1, -1):
cls = []
for i in range(2, wb.worksheets[repn].max_column + 1):
cp = parse_column(wb.worksheets[repn], report_number=repn, column=i)
if not cp['student_ids']:
continue
cls.append(cp)
all_reports[repn] = cls
return all_reports
def parse_column_student_ids(v):
sn = []
@@ -42,7 +30,82 @@ def parse_column_student_ids(v):
sn.append(g)
return sn
def parse_column_numpy(col, report_number, column):
""" Parse a column assuming it is defined as a numpy array.
This is the recommended method as it is much, much faster.
"""
# ws = worksheet # wb.worksheets[sheet]
sn = []
group_id = col[STUDENT_ID_ROW - 1-1] #).value
# col = ['' if col[0] is np.NAN else x for x in col]
for i in range(0, 3):
v = col[i + STUDENT_ID_ROW-1]#, column=column).value
sn += parse_column_student_ids(v)
instructor = col[INSTRUCTOR_ROW-1]#, column=column).value
approver = col[INSTRUCTOR_ROW+1-1]# , column=column).value
if instructor:
instructor = instructor.lower()
if approver:
approver = str(approver).lower()
content = None
comments = None
appr_comments = None
if report_number > 0 and sn:
N = 38
rarr = np.ndarray(shape=(N,1),dtype=np.object)
for j in range(N):
v = col[3 + STUDENT_ID_ROW+j-1]#, column=column).value
rarr[j,0] = v
content = rarr
comments = col[EVALUATION_ROW_END+5-1]# , column=column).value
appr_comments = col[EVALUATION_ROW_END+6-1]# , column=column).value
cgroup = {'column_j': column, 'student_ids': sn, 'instructor': instructor, "approver": approver, 'content': content,
"comments": comments, "approver_comments": appr_comments, 'missing_fields': [],
'group_id': group_id}
# Now, find errors... This involves first finding non-zero columns
if report_number > 0 and sn:
score = cgroup['content'][-3, 0]
cgroup['score'] = score
cgroup['pct'] = score2pct(score)
# if report_number == 3: # this obviously needs fixing for next semester.
# raise Exception("No report number 3 anymore. ")
# I = []
# for i in range(42): # max number of evaluation fields (irrelevant)
# v1 = col[WEIGHT_ROW_START+i-1, RANGE_MIN_COL-1]# ).value
# v2 = col[WEIGHT_ROW_START+i-1, RANGE_MIN_COL+1-1]#).value
# if (v1 == -1 and v2 == 1) or (v1 == 0 and v2 == 4):
# I.append(i)
# if v1 == -1 and v2 == 1:
# # print("delta col")
# break
#
# for i in I:
# w1 = worksheet.cell(row=WEIGHT_ROW_START + i, column=1).value
# w3_ = worksheet.cell(row=INSTRUCTOR_ROW + i+2, column=1).value # should agree with label in w1
# w2 = worksheet.cell(row=INSTRUCTOR_ROW + i+2, column=column).value
# if w2 == None:
# cgroup['missing_fields'].append( (i, w1) )
# if report_number < 3:
# print("old report nr.")
return cgroup
def parse_column(worksheet, report_number, column):
""" This is the old method. It is very slow. Use the numpy-version above.
"""
ws = worksheet # wb.worksheets[sheet]
sn = []
group_id = ws.cell(row=STUDENT_ID_ROW - 1, column=column).value
@@ -54,7 +117,8 @@ def parse_column(worksheet, report_number, column):
instructor = ws.cell(row=INSTRUCTOR_ROW, column=column).value
approver = ws.cell(row=INSTRUCTOR_ROW+1, column=column).value
if instructor: instructor = instructor.lower()
if instructor:
instructor = instructor.lower()
if approver:
approver = str(approver).lower()
@@ -135,32 +199,47 @@ def get_groups_from_report(repn):
cls.append(cp)
return cls
# @profile
def populate_student_report_results(students):
# take students (list-of-dicts in the info format) and assign them the results from the reports.
out = get_output_file()
import time
t0 = time.time()
print("> Loading student report scores from: %s"%out)
if not os.path.exists(out):
return students, []
for k in students:
students[k]['reports'] = {i: None for i in range(4)}
import pandas as pd
wb = openpyxl.load_workbook(out,data_only=True)
wb = openpyxl.load_workbook(out, data_only=True, read_only=True)
# Perhaps find non-empty cols (i.e. those with content)
print("> time elapsed", time.time() - t0)
maximal_groups = []
maximal_groups_students = []
for repn in range(3, -1, -1):
cls = []
for i in range(2, wb.worksheets[repn].max_column + 1):
cp = parse_column(wb.worksheets[repn], report_number=repn, column=i)
sheet = pd.read_excel(out, sheet_name=repn, index_col=None, header=None)
sheet = sheet.fillna('')
sheet = sheet.to_numpy()
# to_numpy()
for i in range(1,sheet.shape[1]):
# for i in range(2, wb.worksheets[repn].max_column + 1):
# print(i, wb.worksheets[repn].max_column)
# s = pd.read_excel(out, sheet_name=1)
cp = parse_column_numpy(sheet[:,i], report_number=repn, column=i)
# cp = parse_column(wb.worksheets[repn], report_number=repn, column=i)
if not cp['student_ids']:
continue
break
cls.append(cp)
for g in cls:
for sid in g['student_ids']:
student = students.get(sid, None)
if student is None:
@@ -172,5 +251,5 @@ def populate_student_report_results(students):
if sid not in maximal_groups_students:
maximal_groups.append(g)
maximal_groups_students += g['student_ids']
print("> time elapsed", time.time() -t0)
return students, maximal_groups
\ No newline at end of file
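The pandas path above reads each sheet once as a raw cell grid and walks columns left to right, stopping at the first column without student ids. A simplified stand-alone sketch (the file name and the emptiness test are placeholders for the real parse_column_numpy check):
```python
import pandas as pd

# hypothetical collected file; header=None keeps the sheet as a raw grid
grid = pd.read_excel("collected.xlsx", sheet_name=0, header=None).fillna("").to_numpy()
for i in range(1, grid.shape[1]):
    col = grid[:, i]
    if not any(str(v).strip() for v in col):  # placeholder for the student-id check
        break                                 # groups occupy contiguous columns
    print("column", i, "->", col[:3])
```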
@@ -3,6 +3,9 @@ import shutil, os, glob
from datetime import datetime, timedelta
import calendar
import pickle
import time
from line_profiler_pycharm import profile
from coursebox.thtools_base import partition_list
import slider
from jinjafy import jinjafy_comment
@@ -16,6 +19,7 @@ from coursebox.core.info import class_information
from coursebox.material.lecture_questions import lecture_question_compiler
from slider import latexmk
import coursebox
# from line_profiler_pycharm import profile
def get_feedback_groups():
paths = get_paths()
@@ -47,12 +51,13 @@ def get_feedback_groups():
reduced_groups = [rg for rg in reduced_groups if len(rg)>0]
# groups are now partitioned.
if len(remaining_lectures) > 0:
fbgs = coursebox.thtools_base.partition_list(reduced_groups, len(remaining_lectures))
fbgs = partition_list(reduced_groups, len(remaining_lectures))
for gg in fbgs:
for g in gg:
already_used = already_used + g
lst = thtools.thtools_base.partition_list([s for s in all_students if s not in already_used], len(remaining_lectures))
lst = partition_list([s for s in all_students if s not in already_used], len(remaining_lectures))
for i in range(len(remaining_lectures)):
dg = []
for g in fbgs[i]: dg += g # flatten the list
@@ -217,7 +222,49 @@ def compile_simple_files(paths, info, template_file_list, verbose=False):
jinjafy_template(data=d2, file_in=fname, file_out=tex_out, filters=get_filters(), template_searchpath=paths['instructor'])
latexmk(tex_out, pdf_out= paths['pdf_out'] + "/" + os.path.basename(tex_out)[:-4]+".pdf")
def fix_shared(paths, output_dir, pdf2png=False,dosvg=True,verbose=False, compile_templates=True):
# rec_fix_shared(shared_base=paths['shared'], output_dir=output_dir)
import time
# import dirsync
# dirsync.sync(paths['shared'], output_dir, 'diff')
# Do smarter fixin'
from pathlib import Path
from jinjafy.cache.simplecache import hash_file_
@profile
def get_hash_from_base(base):
if not os.path.exists(base + "/sharedcache.pkl"):
source = {}
else:
with open(base + "/sharedcache.pkl", 'rb') as f:
source = pickle.load(f)
actual_files = {}
for f in glob.glob(base + "/**", recursive=True):
if os.path.isdir(f):
continue
if f.endswith("sharedcache.pkl"):
continue
rel = os.path.relpath(f, base)
# d = dict(mtime=os.path.getmtime(f))
actual_files[rel] = dict(mtime=os.path.getmtime(f), hash=-1, modified=False)
if rel not in source or (actual_files[rel]['mtime'] != source[rel].get('mtime', -1)): # It has been modified, update hash
# print(rel, time.ctime(actual_files[rel]['mtime']), time.ctime(source[rel].get('mtime', -1)))
new_hash = hash_file_(f)
# actual_files[rel] = {}
actual_files[rel]['modified'] = new_hash != source.get(rel, {}).get('hash', -1)
actual_files[rel]['hash'] = new_hash
else:
actual_files[rel]['hash'] = source[rel]['hash']
return actual_files
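get_hash_from_base re-hashes a file only when its cached mtime is stale; the sync loop below it then copies whenever source and target hashes disagree. A minimal free-standing sketch of that hash-guarded copy (helper names are mine, not the project's):
```python
import hashlib, os, shutil

def file_hash(path):
    # hash the file in chunks so large PDFs are not loaded into memory at once
    h = hashlib.sha256()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 16), b""):
            h.update(chunk)
    return h.hexdigest()

def copy_if_changed(src, dst):
    # copy only when the target is missing or its content differs
    if not os.path.exists(dst) or file_hash(src) != file_hash(dst):
        shutil.copy(src, dst)
```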
@profile
def fix_shared(paths, output_dir, pdf2png=False,dosvg=True,verbose=False, compile_templates=True,shallow=True):
'''
Copy shared files into lecture directories
'''
@@ -225,46 +272,171 @@ def fix_shared(paths, output_dir, pdf2png=False,dosvg=True,verbose=False, compil
from jinjafy.cache import cache_contains_file, cache_update_file
from slider.convert import svg2pdf, pdfcrop
from slider import convert
import filecmp
def rec_fix_shared(shared_base, output_dir):
if dosvg:
for svg in glob.glob(shared_base+"/*.svg"):
if not cache_contains_file(cache_base, svg):
if verbose:
print("converting to pdf", svg)
svg2pdf(svg,crop=True, text_to_path=True)
cache_update_file(cache_base, svg)
files = glob.glob(shared_base+"/*")
for f in files:
if f.endswith("cache.pkl"):
continue
# check if template
if "templates" in f and f.endswith("_partial.tex"):
t0 = time.time()
shared_base = paths['shared']
output_dir = output_dir
import glob
# def get_cache_from_dir(shared_base):
# print("Beginning file cache..")
source = get_hash_from_base(shared_base)
target = get_hash_from_base(output_dir)
# update_source_cache = False
source_extra = {}
for rel in source:
if rel.endswith(".svg") and source[rel]['modified']:
pdf_file = svg2pdf(shared_base + "/"+rel, crop=True, text_to_path=True, verbose=True)
rel = os.path.relpath(pdf_file, shared_base)
source_extra[rel] = dict(mtime=os.path.getmtime(pdf_file), hash=hash_file_(pdf_file), modified=True)
for k, v in source_extra.items():
source[k] = v
# update_source_cache = True
# Perform sync here.
for rel in source:
if rel.endswith("_partial.tex"):
continue
if os.path.isdir(f):
od2 = output_dir + "/" + os.path.basename(f)
if not os.path.exists(od2):
os.mkdir(od2)
rec_fix_shared(f, od2)
else:
of = output_dir + "/" + os.path.basename(f)
if not cache_contains_file(cache_base, f) or not os.path.exists(of):
print(f"> {f} -> {of}")
shutil.copy(f, of)
if f.endswith(".pdf") and pdf2png:
if rel not in target or target[rel]['hash'] != source[rel]['hash']:
print(" -> ", output_dir + "/" + rel)
shutil.copy(shared_base +"/" + rel, output_dir + "/" + rel)
target[rel] = source[rel].copy()
target[rel]['modified'] = True
target[rel]['mtime'] = os.path.getmtime(output_dir + "/" + rel)
if pdf2png:
for rel in target:
if rel.endswith(".pdf") and target[rel]['modified']:
# print("pdf2png: ")
png = convert.pdf2png(output_dir + "/" + rel, verbose=True)
target[rel]['modified'] = False
target[rel]['hash'] = hash_file_(output_dir + "/" + rel)
target[rel]['mtime'] = os.path.getmtime(output_dir + "/" + rel)
# Save the cache.
if verbose:
print(" converting to png", f)
convert.pdf2png(of)
cache_update_file(cache_base, f)
with open(shared_base + "/sharedcache.pkl", 'wb') as f:
pickle.dump(source, f)
if verbose:
print(" done!")
with open(output_dir + "/sharedcache.pkl", 'wb') as f:
pickle.dump(target, f)
print("fix_shared()", time.time() - t0)
#
# if pdf2png:
# if f.endswith(".pdf") and pdf2png:
# if verbose:
# print("converting to png", f)
# convert.pdf2png(of)
#
# for f in source:
# if f not in target:
# print(f)
# else:
# if source[f]['hash'] != target[f]['hash']:
# print(f, f)
#
#
#
# a = 234
# # if rel not in source:
#
# # source[rel] = dict(mtime=os.path.getmtime(f), hash=hash_file_(f))
# #
#
#
# # Everything has a hash/mtime that is up to date. Now look at target dir
#
# get_cache_from_dir(output_dir)
#
# # Get the corresponding output at destination:
#
#
#
#
#
#
# for path in Path(shared_base).rglob('*'):
# print(path)
# a = 234
# def rec_fix_shared(shared_base, output_dir):
# if dosvg:
# for svg in glob.glob(shared_base+"/*.svg"):
# # if not os.path.exists(shared_base + )
# if not cache_contains_file(cache_base, svg):
# # if verbose:
# print("converting to pdf", svg)
# svg2pdf(svg,crop=True, text_to_path=True)
# cache_update_file(cache_base, svg)
# assert False
#
# files = glob.glob(shared_base+"/*")
# for f in files:
# if f.endswith("cache.pkl"):
# continue
#
# if "templates" in f and f.endswith("_partial.tex"):
# continue
#
# if os.path.isdir(f):
# od2 = output_dir + "/" + os.path.basename(f)
# if not os.path.exists(od2):
# os.mkdir(od2)
# rec_fix_shared(f, od2)
# else:
# of = output_dir + "/" + os.path.basename(f)
# if not os.path.exists(of) or not filecmp.cmp(f, of,shallow=shallow):
# print(f"> fix_shared() -> {of}")
# shutil.copy(f, of)
# if f.endswith(".pdf") and pdf2png:
# if verbose:
# print("converting to png", f)
# convert.pdf2png(of)
# # cache_update_file(cache_base, f)
#
# if verbose:
# print(" done!")
# if pdf2png:
# assert False
# get diff.
# directory_cmp = filecmp.dircmp(a=paths['shared'], b=output_dir)
# from filecmp import dircmp
# from filecmp import dircmp
# def print_diff_files(dcmp):
# for name in dcmp.diff_files:
# print("diff_file %s found in %s and %s" % (name, dcmp.left, dcmp.right))
# print("")
# for sub_dcmp in dcmp.subdirs.values():
# print_diff_files(sub_dcmp)
#
# t0 = time.time()
# dcmp = dircmp(paths['shared'], output_dir)
# print_diff_files(dcmp)
# print("dircmp", time.time() - t0)
# directory_cmp.report()
# import time
# t0 = time.time()
# rec_fix_shared(shared_base=paths['shared'], output_dir=output_dir)
# import time
# # import dirsync
# # dirsync.sync(paths['shared'], output_dir, 'diff')
# print("mine", time.time() - t0)
a = 234
rec_fix_shared(shared_base=paths['shared'], output_dir=output_dir)
def jinjafy_shared_templates_dir(paths, info):
tpd = paths['shared'] + "/templates"
......@@ -379,6 +551,7 @@ def mvfiles(source_dir, dest_dir):
if (os.path.isfile(full_file_name)):
shutil.copy(full_file_name, os.path.dirname(dest_dir))
@profile
def make_webpage(dosvg=True):
cinfo = class_information()
paths = get_paths()