diff --git a/exercises/02450Toolbox_Python/Data/body.mat b/exercises/02450Toolbox_Python/Data/body.mat deleted file mode 100644 index 813ae289c6d502ff0a9241334017cbc43ee29bab..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/body.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/courses.txt b/exercises/02450Toolbox_Python/Data/courses.txt deleted file mode 100644 index da22d2b8fb121647f4dbeda6edc3a3f4cae2f366..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/courses.txt +++ /dev/null @@ -1,6 +0,0 @@ -2,5,6,7,8 -1,2,3,6,7,8 -2,4,6,8 -3,6,7 -2,6,7 -2,3,6,7,8 diff --git a/exercises/02450Toolbox_Python/Data/digits.mat b/exercises/02450Toolbox_Python/Data/digits.mat deleted file mode 100644 index 434cf47348fd6709f94ee19381b428904d7d0530..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/digits.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/faithful.mat b/exercises/02450Toolbox_Python/Data/faithful.mat deleted file mode 100644 index 2a87a34731996624bffc3d0af8930ea59fff5d34..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/faithful.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/faithful.txt b/exercises/02450Toolbox_Python/Data/faithful.txt deleted file mode 100644 index d31bbd2a0c0dfeda95ca878b54fb502db0d9aa57..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/faithful.txt +++ /dev/null @@ -1,272 +0,0 @@ -3.600000 79.000000 -1.800000 54.000000 -3.333000 74.000000 -2.283000 62.000000 -4.533000 85.000000 -2.883000 55.000000 -4.700000 88.000000 -3.600000 85.000000 -1.950000 51.000000 -4.350000 85.000000 -1.833000 54.000000 -3.917000 84.000000 -4.200000 78.000000 -1.750000 47.000000 -4.700000 83.000000 -2.167000 52.000000 -1.750000 62.000000 -4.800000 84.000000 -1.600000 52.000000 -4.250000 79.000000 -1.800000 51.000000 -1.750000 47.000000 -3.450000 78.000000 -3.067000 69.000000 -4.533000 74.000000 -3.600000 83.000000 -1.967000 55.000000 -4.083000 76.000000 -3.850000 78.000000 -4.433000 79.000000 -4.300000 73.000000 -4.467000 77.000000 -3.367000 66.000000 -4.033000 80.000000 -3.833000 74.000000 -2.017000 52.000000 -1.867000 48.000000 -4.833000 80.000000 -1.833000 59.000000 -4.783000 90.000000 -4.350000 80.000000 -1.883000 58.000000 -4.567000 84.000000 -1.750000 58.000000 -4.533000 73.000000 -3.317000 83.000000 -3.833000 64.000000 -2.100000 53.000000 -4.633000 82.000000 -2.000000 59.000000 -4.800000 75.000000 -4.716000 90.000000 -1.833000 54.000000 -4.833000 80.000000 -1.733000 54.000000 -4.883000 83.000000 -3.717000 71.000000 -1.667000 64.000000 -4.567000 77.000000 -4.317000 81.000000 -2.233000 59.000000 -4.500000 84.000000 -1.750000 48.000000 -4.800000 82.000000 -1.817000 60.000000 -4.400000 92.000000 -4.167000 78.000000 -4.700000 78.000000 -2.067000 65.000000 -4.700000 73.000000 -4.033000 82.000000 -1.967000 56.000000 -4.500000 79.000000 -4.000000 71.000000 -1.983000 62.000000 -5.067000 76.000000 -2.017000 60.000000 -4.567000 78.000000 -3.883000 76.000000 -3.600000 83.000000 -4.133000 75.000000 -4.333000 82.000000 -4.100000 70.000000 -2.633000 65.000000 -4.067000 73.000000 -4.933000 88.000000 -3.950000 76.000000 -4.517000 80.000000 -2.167000 48.000000 -4.000000 86.000000 -2.200000 60.000000 -4.333000 90.000000 -1.867000 50.000000 -4.817000 78.000000 -1.833000 63.000000 -4.300000 72.000000 -4.667000 84.000000 -3.750000 75.000000 -1.867000 
51.000000 -4.900000 82.000000 -2.483000 62.000000 -4.367000 88.000000 -2.100000 49.000000 -4.500000 83.000000 -4.050000 81.000000 -1.867000 47.000000 -4.700000 84.000000 -1.783000 52.000000 -4.850000 86.000000 -3.683000 81.000000 -4.733000 75.000000 -2.300000 59.000000 -4.900000 89.000000 -4.417000 79.000000 -1.700000 59.000000 -4.633000 81.000000 -2.317000 50.000000 -4.600000 85.000000 -1.817000 59.000000 -4.417000 87.000000 -2.617000 53.000000 -4.067000 69.000000 -4.250000 77.000000 -1.967000 56.000000 -4.600000 88.000000 -3.767000 81.000000 -1.917000 45.000000 -4.500000 82.000000 -2.267000 55.000000 -4.650000 90.000000 -1.867000 45.000000 -4.167000 83.000000 -2.800000 56.000000 -4.333000 89.000000 -1.833000 46.000000 -4.383000 82.000000 -1.883000 51.000000 -4.933000 86.000000 -2.033000 53.000000 -3.733000 79.000000 -4.233000 81.000000 -2.233000 60.000000 -4.533000 82.000000 -4.817000 77.000000 -4.333000 76.000000 -1.983000 59.000000 -4.633000 80.000000 -2.017000 49.000000 -5.100000 96.000000 -1.800000 53.000000 -5.033000 77.000000 -4.000000 77.000000 -2.400000 65.000000 -4.600000 81.000000 -3.567000 71.000000 -4.000000 70.000000 -4.500000 81.000000 -4.083000 93.000000 -1.800000 53.000000 -3.967000 89.000000 -2.200000 45.000000 -4.150000 86.000000 -2.000000 58.000000 -3.833000 78.000000 -3.500000 66.000000 -4.583000 76.000000 -2.367000 63.000000 -5.000000 88.000000 -1.933000 52.000000 -4.617000 93.000000 -1.917000 49.000000 -2.083000 57.000000 -4.583000 77.000000 -3.333000 68.000000 -4.167000 81.000000 -4.333000 81.000000 -4.500000 73.000000 -2.417000 50.000000 -4.000000 85.000000 -4.167000 74.000000 -1.883000 55.000000 -4.583000 77.000000 -4.250000 83.000000 -3.767000 83.000000 -2.033000 51.000000 -4.433000 78.000000 -4.083000 84.000000 -1.833000 46.000000 -4.417000 83.000000 -2.183000 55.000000 -4.800000 81.000000 -1.833000 57.000000 -4.800000 76.000000 -4.100000 84.000000 -3.966000 77.000000 -4.233000 81.000000 -3.500000 87.000000 -4.366000 77.000000 -2.250000 51.000000 -4.667000 78.000000 -2.100000 60.000000 -4.350000 82.000000 -4.133000 91.000000 -1.867000 53.000000 -4.600000 78.000000 -1.783000 46.000000 -4.367000 77.000000 -3.850000 84.000000 -1.933000 49.000000 -4.500000 83.000000 -2.383000 71.000000 -4.700000 80.000000 -1.867000 49.000000 -3.833000 75.000000 -3.417000 64.000000 -4.233000 76.000000 -2.400000 53.000000 -4.800000 94.000000 -2.000000 55.000000 -4.150000 76.000000 -1.867000 50.000000 -4.267000 82.000000 -1.750000 54.000000 -4.483000 75.000000 -4.000000 78.000000 -4.117000 79.000000 -4.083000 78.000000 -4.267000 78.000000 -3.917000 70.000000 -4.550000 79.000000 -4.083000 70.000000 -2.417000 54.000000 -4.183000 86.000000 -2.217000 50.000000 -4.450000 90.000000 -1.883000 54.000000 -1.850000 54.000000 -4.283000 77.000000 -3.950000 79.000000 -2.333000 64.000000 -4.150000 75.000000 -2.350000 47.000000 -4.933000 86.000000 -2.900000 63.000000 -4.583000 85.000000 -3.833000 82.000000 -2.083000 57.000000 -4.367000 82.000000 -2.133000 67.000000 -4.350000 74.000000 -2.200000 54.000000 -4.450000 83.000000 -3.567000 73.000000 -4.500000 73.000000 -4.150000 88.000000 -3.817000 80.000000 -3.917000 71.000000 -4.450000 83.000000 -2.000000 56.000000 -4.283000 79.000000 -4.767000 78.000000 -4.533000 84.000000 -1.850000 58.000000 -4.250000 83.000000 -1.983000 43.000000 -2.250000 60.000000 -4.750000 75.000000 -4.117000 81.000000 -2.150000 46.000000 -4.417000 90.000000 -1.817000 46.000000 -4.467000 74.000000 diff --git a/exercises/02450Toolbox_Python/Data/female.txt 
b/exercises/02450Toolbox_Python/Data/female.txt deleted file mode 100644 index 03a0f10d30373cd4d5d1ac1613028885c23dce77..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/female.txt +++ /dev/null @@ -1,4999 +0,0 @@ -Abagael -Abagail -Abbe -Abbey -Abbi -Abbie -Abby -Abigael -Abigail -Abigale -Abra -Acacia -Ada -Adah -Adaline -Adara -Addie -Addis -Adel -Adela -Adelaide -Adele -Adelice -Adelina -Adelind -Adeline -Adella -Adelle -Adena -Adey -Adi -Adiana -Adina -Adora -Adore -Adoree -Adorne -Adrea -Adria -Adriaens -Adrian -Adriana -Adriane -Adrianna -Adrianne -Adrien -Adriena -Adrienne -Aeriel -Aeriela -Aeriell -Ag -Agace -Agata -Agatha -Agathe -Aggi -Aggie -Aggy -Agna -Agnella -Agnes -Agnese -Agnesse -Agneta -Agnola -Agretha -Aida -Aidan -Aigneis -Aila -Aile -Ailee -Aileen -Ailene -Ailey -Aili -Ailina -Ailyn -Aime -Aimee -Aimil -Aina -Aindrea -Ainslee -Ainsley -Ainslie -Ajay -Alaine -Alameda -Alana -Alanah -Alane -Alanna -Alayne -Alberta -Albertina -Albertine -Albina -Alecia -Aleda -Aleece -Aleecia -Aleen -Alejandra -Alejandrina -Alena -Alene -Alessandra -Aleta -Alethea -Alex -Alexa -Alexandra -Alexandrina -Alexi -Alexia -Alexina -Alexine -Alexis -Alfie -Alfreda -Ali -Alia -Alica -Alice -Alicea -Alicia -Alida -Alidia -Alina -Aline -Alis -Alisa -Alisha -Alison -Alissa -Alisun -Alix -Aliza -Alla -Alleen -Allegra -Allene -Alli -Allianora -Allie -Allina -Allis -Allison -Allissa -Allsun -Ally -Allyce -Allyn -Allys -Allyson -Alma -Almeda -Almeria -Almeta -Almira -Almire -Aloise -Aloisia -Aloysia -Alpa -Alta -Althea -Alvera -Alvina -Alvinia -Alvira -Alyce -Alyda -Alys -Alysa -Alyse -Alysia -Alyson -Alyss -Alyssa -Amabel -Amabelle -Amalea -Amalee -Amaleta -Amalia -Amalie -Amalita -Amalle -Amanda -Amandi -Amandie -Amandy -Amara -Amargo -Amata -Amber -Amberly -Ambrosia -Ambur -Ame -Amelia -Amelie -Amelina -Ameline -Amelita -Ami -Amie -Amity -Ammamaria -Amy -Ana -Anabel -Anabella -Anabelle -Anais -Analiese -Analise -Anallese -Anallise -Anastasia -Anastasie -Anastassia -Anatola -Andee -Andi -Andie -Andra -Andrea -Andreana -Andree -Andrei -Andria -Andriana -Andriette -Andromache -Andromeda -Andy -Anestassia -Anet -Anett -Anetta -Anette -Ange -Angel -Angela -Angele -Angelia -Angelica -Angelika -Angelina -Angeline -Angelique -Angelita -Angelle -Angie -Angil -Angy -Ania -Anica -Anissa -Anita -Anitra -Anja -Anjanette -Anjela -Ann -Ann-Mari -Ann-Marie -Anna -Anna-Diana -Anna-Diane -Anna-Maria -Annabal -Annabel -Annabela -Annabell -Annabella -Annabelle -Annadiana -Annadiane -Annalee -Annalena -Annaliese -Annalisa -Annalise -Annalyse -Annamari -Annamaria -Annamarie -Anne -Anne-Corinne -Anne-Mar -Anne-Marie -Annecorinne -Anneliese -Annelise -Annemarie -Annetta -Annette -Anni -Annice -Annie -Annissa -Annmaria -Annmarie -Annnora -Annora -Anny -Anselma -Ansley -Anstice -Anthe -Anthea -Anthia -Antoinette -Antonella -Antonetta -Antonia -Antonie -Antonietta -Antonina -Anya -Aphrodite -Appolonia -April -Aprilette -Ara -Arabel -Arabela -Arabele -Arabella -Arabelle -Arda -Ardath -Ardeen -Ardelia -Ardelis -Ardella -Ardelle -Arden -Ardene -Ardenia -Ardine -Ardis -Ardith -Ardra -Ardyce -Ardys -Ardyth -Aretha -Ariadne -Ariana -Arianne -Aridatha -Ariel -Ariela -Ariella -Arielle -Arlana -Arlee -Arleen -Arlen -Arlena -Arlene -Arleta -Arlette -Arleyne -Arlie -Arliene -Arlina -Arlinda -Arline -Arly -Arlyn -Arlyne -Aryn -Ashely -Ashlee -Ashleigh -Ashlen -Ashley -Ashli -Ashlie -Ashly -Asia -Astra -Astrid -Astrix -Atalanta -Athena -Athene -Atlanta -Atlante -Auberta -Aubine -Aubree -Aubrette -Aubrey 
-Aubrie -Aubry -Audi -Audie -Audra -Audre -Audrey -Audrie -Audry -Audrye -Audy -Augusta -Auguste -Augustina -Augustine -Aura -Aurea -Aurel -Aurelea -Aurelia -Aurelie -Auria -Aurie -Aurilia -Aurlie -Auroora -Aurora -Aurore -Austin -Austina -Austine -Ava -Aveline -Averil -Averyl -Avie -Avis -Aviva -Avivah -Avril -Avrit -Ayn -Bab -Babara -Babette -Babita -Babs -Bambi -Bambie -Bamby -Barb -Barbabra -Barbara -Barbara-Anne -Barbaraanne -Barbe -Barbee -Barbette -Barbey -Barbi -Barbie -Barbra -Barby -Bari -Barrie -Barry -Basia -Bathsheba -Batsheva -Bea -Beatrice -Beatrisa -Beatrix -Beatriz -Beau -Bebe -Becca -Becka -Becki -Beckie -Becky -Bee -Beilul -Beitris -Bekki -Bel -Belia -Belicia -Belinda -Belita -Bell -Bella -Bellamy -Bellanca -Belle -Bellina -Belva -Belvia -Bendite -Benedetta -Benedicta -Benedikta -Benetta -Benita -Benni -Bennie -Benny -Benoite -Berenice -Beret -Berget -Berna -Bernadene -Bernadette -Bernadina -Bernadine -Bernardina -Bernardine -Bernelle -Bernete -Bernetta -Bernette -Berni -Bernice -Bernie -Bernita -Berny -Berri -Berrie -Berry -Bert -Berta -Berte -Bertha -Berthe -Berti -Bertie -Bertina -Bertine -Berty -Beryl -Beryle -Bess -Bessie -Bessy -Beth -Bethanne -Bethany -Bethena -Bethina -Betsey -Betsy -Betta -Bette -Bette-Ann -Betteann -Betteanne -Betti -Bettie -Bettina -Bettine -Betty -Bettye -Beulah -Bev -Beverie -Beverlee -Beverlie -Beverly -Bevvy -Bianca -Bianka -Biddy -Bidget -Bill -Billi -Billie -Billy -Binni -Binnie -Binny -Bird -Birdie -Birgit -Birgitta -Blair -Blaire -Blake -Blakelee -Blakeley -Blanca -Blanch -Blancha -Blanche -Blinni -Blinnie -Blinny -Bliss -Blisse -Blithe -Blondell -Blondelle -Blondie -Blondy -Blythe -Bo -Bobbette -Bobbi -Bobbie -Bobby -Bobette -Bobina -Bobine -Bobinette -Bonita -Bonnee -Bonni -Bonnie -Bonny -Brana -Brandais -Brande -Brandea -Brandi -Brandice -Brandie -Brandise -Brandy -Brea -Breanne -Brear -Bree -Breena -Bren -Brena -Brenda -Brenn -Brenna -Brett -Bria -Briana -Brianna -Brianne -Bride -Bridget -Bridgett -Bridgette -Bridie -Brier -Brietta -Brigid -Brigida -Brigit -Brigitta -Brigitte -Brina -Briney -Briny -Brit -Brita -Britaney -Britani -Briteny -Britney -Britni -Britt -Britta -Brittan -Brittany -Britte -Brittney -Brook -Brooke -Brooks -Brunella -Brunhilda -Brunhilde -Bryana -Bryn -Bryna -Brynn -Brynna -Brynne -Buffy -Bunni -Bunnie -Bunny -Burta -Cabrina -Cacilia -Cacilie -Caitlin -Caitrin -Cal -Calida -Calla -Calley -Calli -Callida -Callie -Cally -Calypso -Cam -Camala -Camel -Camella -Camellia -Cameo -Cami -Camila -Camile -Camilla -Camille -Cammi -Cammie -Cammy -Canada -Candace -Candi -Candice -Candida -Candide -Candie -Candis -Candra -Candy -Cappella -Caprice -Cara -Caralie -Caren -Carena -Caresa -Caressa -Caresse -Carey -Cari -Caria -Carie -Caril -Carilyn -Carin -Carina -Carine -Cariotta -Carissa -Carita -Caritta -Carla -Carlee -Carleen -Carlen -Carlena -Carlene -Carley -Carli -Carlie -Carlin -Carlina -Carline -Carlisle -Carlita -Carlota -Carlotta -Carly -Carlye -Carlyn -Carlynn -Carlynne -Carma -Carmel -Carmela -Carmelia -Carmelina -Carmelita -Carmella -Carmelle -Carmen -Carmina -Carmine -Carmita -Carmon -Caro -Carol -Carol-Jean -Carola -Carolan -Carolann -Carole -Carolee -Caroleen -Carolie -Carolin -Carolina -Caroline -Caroljean -Carolyn -Carolyne -Carolynn -Caron -Carree -Carri -Carrie -Carrissa -Carrol -Carroll -Carry -Cary -Caryl -Caryn -Casandra -Casey -Casi -Casia -Casie -Cass -Cassandra -Cassandre -Cassandry -Cassaundra -Cassey -Cassi -Cassie -Cassondra -Cassy -Cat -Catarina -Cate -Caterina -Catha -Catharina -Catharine -Cathe 
-Cathee -Catherin -Catherina -Catherine -Cathi -Cathie -Cathleen -Cathlene -Cathrin -Cathrine -Cathryn -Cathy -Cathyleen -Cati -Catie -Catina -Catlaina -Catlee -Catlin -Catrina -Catriona -Caty -Cayla -Cecelia -Cecil -Cecile -Ceciley -Cecilia -Cecilla -Cecily -Ceil -Cele -Celene -Celesta -Celeste -Celestia -Celestina -Celestine -Celestyn -Celestyna -Celia -Celie -Celina -Celinda -Celine -Celinka -Celisse -Celle -Cesya -Chad -Chanda -Chandal -Chandra -Channa -Chantal -Chantalle -Charil -Charin -Charis -Charissa -Charisse -Charita -Charity -Charla -Charlean -Charleen -Charlena -Charlene -Charline -Charlot -Charlott -Charlotta -Charlotte -Charmain -Charmaine -Charmane -Charmian -Charmine -Charmion -Charo -Charyl -Chastity -Chelsae -Chelsea -Chelsey -Chelsie -Chelsy -Cher -Chere -Cherey -Cheri -Cherianne -Cherice -Cherida -Cherie -Cherilyn -Cherilynn -Cherin -Cherise -Cherish -Cherlyn -Cherri -Cherrita -Cherry -Chery -Cherye -Cheryl -Cheslie -Chiarra -Chickie -Chicky -Chiquita -Chloe -Chloette -Chloris -Chris -Chriss -Chrissa -Chrissie -Chrissy -Christa -Christabel -Christabella -Christabelle -Christal -Christalle -Christan -Christean -Christel -Christen -Christi -Christian -Christiana -Christiane -Christie -Christin -Christina -Christine -Christy -Christyna -Chrysa -Chrysler -Chrystal -Chryste -Chrystel -Ciara -Cicely -Cicily -Ciel -Cilka -Cinda -Cindee -Cindelyn -Cinderella -Cindi -Cindie -Cindra -Cindy -Cinnamon -Cissie -Cissy -Clair -Claire -Clara -Clarabelle -Clare -Claresta -Clareta -Claretta -Clarette -Clarey -Clari -Claribel -Clarice -Clarie -Clarinda -Clarine -Clarisa -Clarissa -Clarisse -Clarita -Clary -Claude -Claudelle -Claudetta -Claudette -Claudia -Claudie -Claudina -Claudine -Clea -Clem -Clemence -Clementia -Clementina -Clementine -Clemmie -Clemmy -Cleo -Cleopatra -Clerissa -Cleva -Clio -Clo -Cloe -Cloris -Clotilda -Clovis -Codee -Codi -Codie -Cody -Coleen -Colene -Coletta -Colette -Colleen -Collete -Collette -Collie -Colline -Colly -Con -Concettina -Conchita -Concordia -Conney -Conni -Connie -Conny -Consolata -Constance -Constancia -Constancy -Constanta -Constantia -Constantina -Constantine -Consuela -Consuelo -Cookie -Cora -Corabel -Corabella -Corabelle -Coral -Coralie -Coraline -Coralyn -Cordelia -Cordelie -Cordey -Cordie -Cordula -Cordy -Coreen -Corella -Corena -Corenda -Corene -Coretta -Corette -Corey -Cori -Corie -Corilla -Corina -Corine -Corinna -Corinne -Coriss -Corissa -Corliss -Corly -Cornela -Cornelia -Cornelle -Cornie -Corny -Correna -Correy -Corri -Corrianne -Corrie -Corrina -Corrine -Corrinne -Corry -Cortney -Cory -Cosetta -Cosette -Courtenay -Courtney -Cresa -Cris -Crissie -Crissy -Crista -Cristabel -Cristal -Cristen -Cristi -Cristie -Cristin -Cristina -Cristine -Cristionna -Cristy -Crysta -Crystal -Crystie -Cyb -Cybal -Cybel -Cybelle -Cybil -Cybill -Cyndi -Cyndy -Cynthea -Cynthia -Cynthie -Cynthy -Dacey -Dacia -Dacie -Dacy -Dael -Daffi -Daffie -Daffy -Dafna -Dagmar -Dahlia -Daile -Daisey -Daisi -Daisie -Daisy -Dale -Dalenna -Dalia -Dalila -Dallas -Daloris -Damara -Damaris -Damita -Dana -Danell -Danella -Danelle -Danette -Dani -Dania -Danica -Danice -Daniel -Daniela -Daniele -Daniella -Danielle -Danika -Danila -Danit -Danita -Danna -Danni -Dannie -Danny -Dannye -Danya -Danyelle -Danyette -Daphene -Daphna -Daphne -Dara -Darb -Darbie -Darby -Darcee -Darcey -Darci -Darcie -Darcy -Darda -Dareen -Darell -Darelle -Dari -Daria -Darice -Darla -Darleen -Darlene -Darline -Darryl -Darsey -Darsie -Darya -Daryl -Daryn -Dasha -Dasi -Dasie -Dasya -Datha -Daune -Daveen -Daveta 
-Davida -Davina -Davine -Davita -Dawn -Dawna -Dayle -Dayna -Dea -Deana -Deane -Deanna -Deanne -Deb -Debbi -Debbie -Debbra -Debby -Debee -Debera -Debi -Debor -Debora -Deborah -Debra -Dede -Dedie -Dedra -Dee -Dee Dee -Deeann -Deeanne -Deedee -Deena -Deerdre -Dehlia -Deidre -Deina -Deirdre -Del -Dela -Delaney -Delcina -Delcine -Delia -Delila -Delilah -Delinda -Dell -Della -Delly -Delora -Delores -Deloria -Deloris -Delphina -Delphine -Delphinia -Demeter -Demetra -Demetria -Demetris -Dena -Deni -Denice -Denise -Denna -Denni -Dennie -Denny -Deny -Denys -Denyse -Deonne -Desaree -Desdemona -Desirae -Desiree -Desiri -Deva -Devan -Devi -Devin -Devina -Devinne -Devon -Devondra -Devonna -Devonne -Devora -Dew -Di -Diahann -Diamond -Dian -Diana -Diandra -Diane -Diane-Marie -Dianemarie -Diann -Dianna -Dianne -Diannne -Didi -Dido -Diena -Dierdre -Dina -Dinah -Dinnie -Dinny -Dion -Dione -Dionis -Dionne -Dita -Dix -Dixie -Dode -Dodi -Dodie -Dody -Doe -Doll -Dolley -Dolli -Dollie -Dolly -Dolora -Dolores -Dolorita -Doloritas -Dominica -Dominique -Dona -Donella -Donelle -Donetta -Donia -Donica -Donielle -Donna -Donnajean -Donnamarie -Donni -Donnie -Donny -Dora -Doralia -Doralin -Doralyn -Doralynn -Doralynne -Dorcas -Dore -Doreen -Dorelia -Dorella -Dorelle -Dorena -Dorene -Doretta -Dorette -Dorey -Dori -Doria -Dorian -Dorice -Dorie -Dorine -Doris -Dorisa -Dorise -Dorit -Dorita -Doro -Dorolice -Dorolisa -Dorotea -Doroteya -Dorothea -Dorothee -Dorothy -Dorree -Dorri -Dorrie -Dorris -Dorry -Dorthea -Dorthy -Dory -Dosi -Dot -Doti -Dotti -Dottie -Dotty -Dove -Drea -Drew -Dulce -Dulcea -Dulci -Dulcia -Dulciana -Dulcie -Dulcine -Dulcinea -Dulcy -Dulsea -Dusty -Dyan -Dyana -Dyane -Dyann -Dyanna -Dyanne -Dyna -Dynah -Eada -Eadie -Eadith -Ealasaid -Eartha -Easter -Eba -Ebba -Ebonee -Ebony -Eda -Eddi -Eddie -Eddy -Ede -Edee -Edeline -Eden -Edi -Edie -Edin -Edita -Edith -Editha -Edithe -Ediva -Edna -Edwina -Edy -Edyth -Edythe -Effie -Eileen -Eilis -Eimile -Eirena -Ekaterina -Elaina -Elaine -Elana -Elane -Elayne -Elberta -Elbertina -Elbertine -Eleanor -Eleanora -Eleanore -Electra -Elena -Elene -Eleni -Elenore -Eleonora -Eleonore -Elfie -Elfreda -Elfrida -Elfrieda -Elga -Elianora -Elianore -Elicia -Elie -Elinor -Elinore -Elisa -Elisabet -Elisabeth -Elisabetta -Elise -Elisha -Elissa -Elita -Eliza -Elizabet -Elizabeth -Elka -Elke -Ella -Elladine -Elle -Ellen -Ellene -Ellette -Elli -Ellie -Ellissa -Elly -Ellyn -Ellynn -Elmira -Elna -Elnora -Elnore -Eloisa -Eloise -Elonore -Elora -Elsa -Elsbeth -Else -Elsey -Elsi -Elsie -Elsinore -Elspeth -Elsy -Elva -Elvera -Elvina -Elvira -Elwina -Elwira -Elyn -Elyse -Elysee -Elysha -Elysia -Elyssa -Em -Ema -Emalee -Emalia -Emanuela -Emelda -Emelia -Emelina -Emeline -Emelita -Emelyne -Emera -Emilee -Emili -Emilia -Emilie -Emiline -Emily -Emlyn -Emlynn -Emlynne -Emma -Emmalee -Emmaline -Emmalyn -Emmalynn -Emmalynne -Emmeline -Emmey -Emmi -Emmie -Emmy -Emmye -Emogene -Emyle -Emylee -Endora -Engracia -Enid -Enrica -Enrichetta -Enrika -Enriqueta -Enya -Eolanda -Eolande -Eran -Erda -Erena -Erica -Ericha -Ericka -Erika -Erin -Erina -Erinn -Erinna -Erma -Ermengarde -Ermentrude -Ermina -Erminia -Erminie -Erna -Ernaline -Ernesta -Ernestine -Ertha -Eryn -Esma -Esmaria -Esme -Esmeralda -Esmerelda -Essa -Essie -Essy -Esta -Estel -Estele -Estell -Estella -Estelle -Ester -Esther -Estrella -Estrellita -Ethel -Ethelda -Ethelin -Ethelind -Etheline -Ethelyn -Ethyl -Etta -Etti -Ettie -Etty -Eudora -Eugenia -Eugenie -Eugine -Eula -Eulalie -Eunice -Euphemia -Eustacia -Eva -Evaleen -Evangelia -Evangelin -Evangelina 
-Evangeline -Evania -Evanne -Eve -Eveleen -Evelina -Eveline -Evelyn -Evette -Evey -Evie -Evita -Evonne -Evvie -Evvy -Evy -Eyde -Eydie -Fabrianne -Fabrice -Fae -Faina -Faith -Fallon -Fan -Fanchette -Fanchon -Fancie -Fancy -Fanechka -Fania -Fanni -Fannie -Fanny -Fanya -Fara -Farah -Farand -Farica -Farra -Farrah -Farrand -Fatima -Faun -Faunie -Faustina -Faustine -Fawn -Fawna -Fawne -Fawnia -Fay -Faydra -Faye -Fayette -Fayina -Fayre -Fayth -Faythe -Federica -Fedora -Felecia -Felicdad -Felice -Felicia -Felicity -Felicle -Felipa -Felisha -Felita -Feliza -Fenelia -Feodora -Ferdinanda -Ferdinande -Fern -Fernanda -Fernande -Fernandina -Ferne -Fey -Fiann -Fianna -Fidela -Fidelia -Fidelity -Fifi -Fifine -Filia -Filide -Filippa -Fina -Fiona -Fionna -Fionnula -Fiorenze -Fleur -Fleurette -Flo -Flor -Flora -Florance -Flore -Florella -Florence -Florencia -Florentia -Florenza -Florette -Flori -Floria -Florice -Florida -Florie -Florina -Florinda -Floris -Florri -Florrie -Florry -Flory -Flossi -Flossie -Flossy -Flower -Fortuna -Fortune -Fran -France -Francene -Frances -Francesca -Francesmary -Francine -Francis -Francisca -Franciska -Francoise -Francyne -Frank -Frankie -Franky -Franni -Frannie -Franny -Frayda -Fred -Freda -Freddi -Freddie -Freddy -Fredelia -Frederica -Fredericka -Fredi -Fredia -Fredra -Fredrika -Freida -Frieda -Friederike -Fulvia -Gabbey -Gabbi -Gabbie -Gabey -Gabi -Gabie -Gabriel -Gabriela -Gabriell -Gabriella -Gabrielle -Gabriellia -Gabrila -Gaby -Gae -Gael -Gail -Gale -Galina -Garland -Garnet -Garnette -Gates -Gavra -Gavrielle -Gay -Gayla -Gayle -Gayleen -Gaylene -Gaynor -Geeta -Gelya -Gen -Gena -Gene -Geneva -Genevieve -Genevra -Genia -Genna -Genni -Gennie -Gennifer -Genny -Genovera -Genvieve -George -Georgeanna -Georgeanne -Georgena -Georgeta -Georgetta -Georgette -Georgia -Georgiamay -Georgiana -Georgianna -Georgianne -Georgie -Georgina -Georgine -Gera -Geralda -Geraldina -Geraldine -Gerda -Gerhardine -Geri -Gerianna -Gerianne -Gerladina -Germain -Germaine -Germana -Gerri -Gerrie -Gerrilee -Gerry -Gert -Gerta -Gerti -Gertie -Gertrud -Gertruda -Gertrude -Gertrudis -Gerty -Giacinta -Giana -Gianina -Gianna -Gigi -Gilberta -Gilberte -Gilbertina -Gilbertine -Gilda -Gill -Gillan -Gilli -Gillian -Gillie -Gilligan -Gilly -Gina -Ginelle -Ginevra -Ginger -Ginni -Ginnie -Ginnifer -Ginny -Giorgia -Giovanna -Gipsy -Giralda -Gisela -Gisele -Gisella -Giselle -Gita -Gizela -Glad -Gladi -Gladis -Gladys -Gleda -Glen -Glenda -Glenine -Glenn -Glenna -Glennie -Glennis -Glori -Gloria -Gloriana -Gloriane -Glorianna -Glory -Glyn -Glynda -Glynis -Glynnis -Godiva -Golda -Goldarina -Goldi -Goldia -Goldie -Goldina -Goldy -Grace -Gracia -Gracie -Grata -Gratia -Gratiana -Gray -Grayce -Grazia -Gredel -Greer -Greta -Gretal -Gretchen -Grete -Gretel -Grethel -Gretna -Gretta -Grier -Griselda -Grissel -Guendolen -Guenevere -Guenna -Guglielma -Gui -Guillema -Guillemette -Guinevere -Guinna -Gunilla -Gunvor -Gus -Gusella -Gussi -Gussie -Gussy -Gusta -Gusti -Gustie -Gusty -Gwen -Gwendolen -Gwendolin -Gwendolyn -Gweneth -Gwenette -Gwenn -Gwenneth -Gwenni -Gwennie -Gwenny -Gwenora -Gwenore -Gwyn -Gwyneth -Gwynne -Gypsy -Hadria -Hailee -Haily -Haleigh -Halette -Haley -Hali -Halie -Halimeda -Halley -Halli -Hallie -Hally -Hana -Hanna -Hannah -Hanni -Hannibal -Hannie -Hannis -Hanny -Happy -Harlene -Harley -Harli -Harlie -Harmonia -Harmonie -Harmony -Harri -Harrie -Harriet -Harriett -Harrietta -Harriette -Harriot -Harriott -Hatti -Hattie -Hatty -Havivah -Hayley -Hazel -Heath -Heather -Heda -Hedda -Heddi -Heddie -Hedi -Hedvig -Hedwig 
-Hedy -Heida -Heide -Heidi -Heidie -Helaina -Helaine -Helen -Helen-Elizabeth -Helena -Helene -Helga -Helge -Helise -Hellene -Helli -Heloise -Helsa -Helyn -Hendrika -Henka -Henrie -Henrieta -Henrietta -Henriette -Henryetta -Hephzibah -Hermia -Hermina -Hermine -Herminia -Hermione -Herta -Hertha -Hester -Hesther -Hestia -Hetti -Hettie -Hetty -Hilarie -Hilary -Hilda -Hildagard -Hildagarde -Hilde -Hildegaard -Hildegarde -Hildy -Hillary -Hilliary -Hinda -Holley -Holli -Hollie -Holly -Holly-Anne -Hollyanne -Honey -Honor -Honoria -Hope -Horatia -Hortense -Hortensia -Hulda -Hyacinth -Hyacintha -Hyacinthe -Hyacinthia -Hyacinthie -Hynda -Ianthe -Ibbie -Ibby -Ida -Idalia -Idalina -Idaline -Idell -Idelle -Idette -Ike -Ikey -Ilana -Ileana -Ileane -Ilene -Ilise -Ilka -Illa -Ilona -Ilsa -Ilse -Ilysa -Ilyse -Ilyssa -Imelda -Imogen -Imogene -Imojean -Ina -Inci -Indira -Ines -Inesita -Inessa -Inez -Inga -Ingaberg -Ingaborg -Inge -Ingeberg -Ingeborg -Inger -Ingrid -Ingunna -Inna -Ioana -Iolande -Iolanthe -Iona -Iormina -Ira -Irena -Irene -Irina -Iris -Irita -Irma -Isa -Isabeau -Isabel -Isabelita -Isabella -Isabelle -Isador -Isadora -Isadore -Isahella -Iseabal -Isidora -Isis -Isobel -Issi -Issie -Issy -Ivett -Ivette -Ivie -Ivonne -Ivory -Ivy -Izabel -Izzi -Jacenta -Jacinda -Jacinta -Jacintha -Jacinthe -Jackelyn -Jacki -Jackie -Jacklin -Jacklyn -Jackquelin -Jackqueline -Jacky -Jaclin -Jaclyn -Jacquelin -Jacqueline -Jacquelyn -Jacquelynn -Jacquenetta -Jacquenette -Jacquetta -Jacquette -Jacqui -Jacquie -Jacynth -Jada -Jade -Jaime -Jaimie -Jaine -Jaleh -Jami -Jamie -Jamima -Jammie -Jan -Jana -Janaya -Janaye -Jandy -Jane -Janean -Janeczka -Janeen -Janel -Janela -Janella -Janelle -Janene -Janenna -Janessa -Janet -Janeta -Janetta -Janette -Janeva -Janey -Jania -Janice -Janie -Janifer -Janina -Janine -Janis -Janith -Janka -Janna -Jannel -Jannelle -Janot -Jany -Jaquelin -Jaquelyn -Jaquenetta -Jaquenette -Jaquith -Jasmin -Jasmina -Jasmine -Jayme -Jaymee -Jayne -Jaynell -Jazmin -Jean -Jeana -Jeane -Jeanelle -Jeanette -Jeanie -Jeanine -Jeanna -Jeanne -Jeannette -Jeannie -Jeannine -Jehanna -Jelene -Jemie -Jemima -Jemimah -Jemmie -Jemmy -Jen -Jena -Jenda -Jenelle -Jenette -Jeni -Jenica -Jeniece -Jenifer -Jeniffer -Jenilee -Jenine -Jenn -Jenna -Jennee -Jennette -Jenni -Jennica -Jennie -Jennifer -Jennilee -Jennine -Jenny -Jeraldine -Jeralee -Jere -Jeri -Jermaine -Jerrie -Jerrilee -Jerrilyn -Jerrine -Jerry -Jerrylee -Jess -Jessa -Jessalin -Jessalyn -Jessamine -Jessamyn -Jesse -Jesselyn -Jessi -Jessica -Jessie -Jessika -Jessy -Jewel -Jewell -Jewelle -Jill -Jillana -Jillane -Jillayne -Jilleen -Jillene -Jilli -Jillian -Jillie -Jilly -Jinny -Jo -Jo Ann -Jo-Ann -JoAnn -Jo-Anne -JoAnne -Joan -Joana -Joane -Joanie -Joann -Joanna -Joanne -Joannes -Jobey -Jobi -Jobie -Jobina -Joby -Jobye -Jobyna -Jocelin -Joceline -Jocelyn -Jocelyne -Jodee -Jodi -Jodie -Jody -Joela -Joelie -Joell -Joella -Joelle -Joellen -Joelly -Joellyn -Joelynn -Joete -Joey -Johanna -Johannah -Johnette -Johnna -Joice -Jojo -Jolee -Joleen -Jolene -Joletta -Joli -Jolie -Joline -Joly -Jolyn -Jolynn -Jonell -Joni -Jonie -Jonis -Jordain -Jordan -Jordana -Jordanna -Jorey -Jori -Jorie -Jorrie -Jorry -Joscelin -Josee -Josefa -Josefina -Joselyn -Josepha -Josephina -Josephine -Josey -Josi -Josie -Joslyn -Josselyn -Josy -Jourdan -Joy -Joya -Joyan -Joyann -Joyce -Joycelin -Joye -Joyous -Juana -Juanita -Jude -Judi -Judie -Judith -Juditha -Judy -Judye -Julee -Juli -Julia -Juliana -Juliane -Juliann -Julianna -Julianne -Julie -Julienne -Juliet -Julieta -Julietta -Juliette -Julina 
-Juline -Julissa -Julita -June -Junette -Junia -Junie -Junina -Justin -Justina -Justine -Jyoti -Kaari -Kacey -Kacie -Kacy -Kai -Kaia -Kaila -Kaile -Kailey -Kaitlin -Kaitlyn -Kaitlynn -Kaja -Kakalina -Kala -Kaleena -Kali -Kalie -Kalila -Kalina -Kalinda -Kalindi -Kalli -Kally -Kameko -Kamila -Kamilah -Kamillah -Kandace -Kandy -Kania -Kanya -Kara -Kara-Lynn -Karalee -Karalynn -Kare -Karee -Karel -Karen -Karena -Kari -Karia -Karie -Karil -Karilynn -Karin -Karina -Karine -Kariotta -Karisa -Karissa -Karita -Karla -Karlee -Karleen -Karlen -Karlene -Karlie -Karlotta -Karlotte -Karly -Karlyn -Karmen -Karna -Karol -Karola -Karole -Karolina -Karoline -Karoly -Karon -Karrah -Karrie -Karry -Kary -Karyl -Karylin -Karyn -Kasey -Kass -Kassandra -Kassey -Kassi -Kassia -Kassie -Kaster -Kat -Kata -Katalin -Kate -Katee -Katerina -Katerine -Katey -Kath -Katha -Katharina -Katharine -Katharyn -Kathe -Katheleen -Katherina -Katherine -Katheryn -Kathi -Kathie -Kathleen -Kathlene -Kathlin -Kathrine -Kathryn -Kathryne -Kathy -Kathye -Kati -Katie -Katina -Katine -Katinka -Katleen -Katlin -Katrina -Katrine -Katrinka -Katti -Kattie -Katuscha -Katusha -Katy -Katya -Kay -Kaycee -Kaye -Kayla -Kayle -Kaylee -Kayley -Kaylil -Kaylyn -Kee -Keeley -Keelia -Keely -Kelcey -Kelci -Kelcie -Kelcy -Kelila -Kellen -Kelley -Kelli -Kellia -Kellie -Kellina -Kellsie -Kelly -Kellyann -Kelsey -Kelsi -Kelsy -Kendra -Kendre -Kenna -Keren -Keri -Keriann -Kerianne -Kerri -Kerrie -Kerrill -Kerrin -Kerry -Kerstin -Kesley -Keslie -Kessia -Kessiah -Ketti -Kettie -Ketty -Kevina -Kevyn -Ki -Kia -Kiah -Kial -Kiele -Kiersten -Kikelia -Kiley -Kim -Kimberlee -Kimberley -Kimberli -Kimberly -Kimberlyn -Kimbra -Kimmi -Kimmie -Kimmy -Kinna -Kip -Kipp -Kippie -Kippy -Kira -Kirbee -Kirbie -Kirby -Kiri -Kirsten -Kirsteni -Kirsti -Kirstie -Kirstin -Kirstyn -Kissee -Kissiah -Kissie -Kit -Kitti -Kittie -Kitty -Kizzee -Kizzie -Klara -Klarika -Klarrisa -Konstance -Konstanze -Koo -Kora -Koral -Koralle -Kordula -Kore -Korella -Koren -Koressa -Kori -Korie -Korney -Korrie -Korry -Kourtney -Kris -Krissie -Krissy -Krista -Kristal -Kristan -Kriste -Kristel -Kristen -Kristi -Kristien -Kristin -Kristina -Kristine -Kristy -Kristyn -Krysta -Krystal -Krystalle -Krystle -Krystyna -Kyla -Kyle -Kylen -Kylie -Kylila -Kylynn -Kym -Kynthia -Kyrstin -La -Lacee -Lacey -Lacie -Lacy -Ladonna -Laetitia -Laila -Laina -Lainey -Lamb -Lana -Lane -Lanette -Laney -Lani -Lanie -Lanita -Lanna -Lanni -Lanny -Lara -Laraine -Lari -Larina -Larine -Larisa -Larissa -Lark -Laryssa -Latashia -Latia -Latisha -Latrena -Latrina -Laura -Lauraine -Laural -Lauralee -Laure -Lauree -Laureen -Laurel -Laurella -Lauren -Laurena -Laurene -Lauretta -Laurette -Lauri -Laurianne -Laurice -Laurie -Lauryn -Lavena -Laverna -Laverne -Lavina -Lavinia -Lavinie -Layla -Layne -Layney -Lea -Leah -Leandra -Leann -Leanna -Leanne -Leanor -Leanora -Lebbie -Leda -Lee -LeeAnn -Leeann -Leeanne -Leela -Leelah -Leena -Leesa -Leese -Legra -Leia -Leiah -Leigh -Leigha -Leila -Leilah -Leisha -Lela -Lelah -Leland -Lelia -Lena -Lenee -Lenette -Lenka -Lenna -Lenora -Lenore -Leodora -Leoine -Leola -Leoline -Leona -Leonanie -Leone -Leonelle -Leonie -Leonora -Leonore -Leontine -Leontyne -Leora -Leorah -Leshia -Lesley -Lesli -Leslie -Lesly -Lesya -Leta -Lethia -Leticia -Letisha -Letitia -Letta -Letti -Lettie -Letty -Leyla -Lezlie -Lia -Lian -Liana -Liane -Lianna -Lianne -Lib -Libbey -Libbi -Libbie -Libby -Licha -Lida -Lidia -Lil -Lila -Lilah -Lilas -Lilia -Lilian -Liliane -Lilias -Lilith -Lilla -Lilli -Lillian -Lillis -Lilllie -Lilly -Lily -Lilyan 
-Lin -Lina -Lind -Linda -Lindi -Lindie -Lindsay -Lindsey -Lindsy -Lindy -Linea -Linell -Linet -Linette -Linn -Linnea -Linnell -Linnet -Linnie -Linzy -Liora -Liorah -Lira -Lisa -Lisabeth -Lisandra -Lisbeth -Lise -Lisetta -Lisette -Lisha -Lishe -Lissa -Lissi -Lissie -Lissy -Lita -Liuka -Livia -Liz -Liza -Lizabeth -Lizbeth -Lizette -Lizzie -Lizzy -Loella -Lois -Loise -Lola -Lolande -Loleta -Lolita -Lolly -Lona -Lonee -Loni -Lonna -Lonni -Lonnie -Lora -Lorain -Loraine -Loralee -Loralie -Loralyn -Loree -Loreen -Lorelei -Lorelle -Loren -Lorena -Lorene -Lorenza -Loretta -Lorettalorna -Lorette -Lori -Loria -Lorianna -Lorianne -Lorie -Lorilee -Lorilyn -Lorinda -Lorine -Lorita -Lorna -Lorne -Lorraine -Lorrayne -Lorri -Lorrie -Lorrin -Lorry -Lory -Lotta -Lotte -Lotti -Lottie -Lotty -Lou -Louella -Louisa -Louise -Louisette -Love -Luana -Luanna -Luce -Luci -Lucia -Luciana -Lucie -Lucienne -Lucila -Lucilia -Lucille -Lucina -Lucinda -Lucine -Lucita -Lucky -Lucretia -Lucy -Luella -Luelle -Luisa -Luise -Lula -Lulita -Lulu -Luna -Lura -Lurette -Lurleen -Lurlene -Lurline -Lusa -Lust -Lyda -Lydia -Lydie -Lyn -Lynda -Lynde -Lyndel -Lyndell -Lyndsay -Lyndsey -Lyndsie -Lyndy -Lynea -Lynelle -Lynett -Lynette -Lynn -Lynna -Lynne -Lynnea -Lynnell -Lynnelle -Lynnet -Lynnett -Lynnette -Lynsey -Lysandra -Lyssa -Mab -Mabel -Mabelle -Mable -Mada -Madalena -Madalyn -Maddalena -Maddi -Maddie -Maddy -Madel -Madelaine -Madeleine -Madelena -Madelene -Madelin -Madelina -Madeline -Madella -Madelle -Madelon -Madelyn -Madge -Madlen -Madlin -Madona -Madonna -Mady -Mae -Maegan -Mag -Magda -Magdaia -Magdalen -Magdalena -Magdalene -Maggee -Maggi -Maggie -Maggy -Magna -Mahala -Mahalia -Maia -Maible -Maiga -Mair -Maire -Mairead -Maisey -Maisie -Mala -Malanie -Malcah -Malena -Malia -Malina -Malinda -Malinde -Malissa -Malissia -Malka -Malkah -Mallissa -Mallorie -Mallory -Malorie -Malory -Malva -Malvina -Malynda -Mame -Mamie -Manda -Mandi -Mandie -Mandy -Manon -Manya -Mara -Marabel -Marcela -Marcelia -Marcella -Marcelle -Marcellina -Marcelline -Marchelle -Marci -Marcia -Marcie -Marcile -Marcille -Marcy -Mareah -Maren -Marena -Maressa -Marga -Margalit -Margalo -Margaret -Margareta -Margarete -Margaretha -Margarethe -Margaretta -Margarette -Margarita -Margaux -Marge -Margeaux -Margery -Marget -Margette -Margi -Margie -Margit -Marglerite -Margo -Margot -Margret -Marguerite -Margurite -Margy -Mari -Maria -Mariam -Marian -Mariana -Mariann -Marianna -Marianne -Maribel -Maribelle -Maribeth -Marice -Maridel -Marie -Marie-Ann -Marie-Jeanne -Marieann -Mariejeanne -Mariel -Mariele -Marielle -Mariellen -Marietta -Mariette -Marigold -Marijo -Marika -Marilee -Marilin -Marillin -Marilyn -Marin -Marina -Marinna -Marion -Mariquilla -Maris -Marisa -Mariska -Marissa -Marit -Marita -Maritsa -Mariya -Marj -Marja -Marje -Marji -Marjie -Marjorie -Marjory -Marjy -Marketa -Marla -Marlane -Marleah -Marlee -Marleen -Marlena -Marlene -Marley -Marlie -Marline -Marlo -Marlyn -Marna -Marne -Marney -Marni -Marnia -Marnie -Marquita -Marrilee -Marris -Marrissa -Marry -Marsha -Marsiella -Marta -Martelle -Martguerita -Martha -Marthe -Marthena -Marti -Martica -Martie -Martina -Martita -Marty -Martynne -Mary -Marya -Maryangelyn -Maryann -Maryanna -Maryanne -Marybelle -Marybeth -Maryellen -Maryjane -Maryjo -Maryl -Marylee -Marylin -Marylinda -Marylou -Marylynne -Maryrose -Marys -Marysa -Masha -Matelda -Mathilda -Mathilde -Matilda -Matilde -Matti -Mattie -Matty -Maud -Maude -Maudie -Maura -Maure -Maureen -Maureene -Maurene -Maurine -Maurise -Maurita -Mavis -Mavra -Max -Maxi 
-Maxie -Maxine -Maxy -May -Maya -Maybelle -Mayda -Maye -Mead -Meade -Meagan -Meaghan -Meara -Mechelle -Meg -Megan -Megen -Meggan -Meggi -Meggie -Meggy -Meghan -Meghann -Mehetabel -Mei -Meira -Mel -Mela -Melamie -Melania -Melanie -Melantha -Melany -Melba -Melesa -Melessa -Melicent -Melina -Melinda -Melinde -Melisa -Melisande -Melisandra -Melisenda -Melisent -Melissa -Melisse -Melita -Melitta -Mella -Melli -Mellicent -Mellie -Mellisa -Mellisent -Mellissa -Melloney -Melly -Melodee -Melodie -Melody -Melonie -Melony -Melosa -Melva -Mercedes -Merci -Mercie -Mercy -Meredith -Meredithe -Meridel -Meridith -Meriel -Merilee -Merilyn -Meris -Merissa -Merl -Merla -Merle -Merlina -Merline -Merna -Merola -Merralee -Merridie -Merrie -Merrielle -Merrile -Merrilee -Merrili -Merrill -Merrily -Merry -Mersey -Meryl -Meta -Mia -Micaela -Michaela -Michaelina -Michaeline -Michaella -Michal -Michel -Michele -Michelina -Micheline -Michell -Michelle -Micki -Mickie -Micky -Midge -Mignon -Mignonne -Miguela -Miguelita -Mikako -Mildred -Mildrid -Milena -Milicent -Milissent -Milka -Milli -Millicent -Millie -Millisent -Milly -Milzie -Mimi -Min -Mina -Minda -Mindy -Minerva -Minetta -Minette -Minna -Minni -Minnie -Minny -Minta -Miquela -Mira -Mirabel -Mirabella -Mirabelle -Miran -Miranda -Mireielle -Mireille -Mirella -Mirelle -Miriam -Mirilla -Mirna -Misha -Missie -Missy -Misti -Misty -Mitra -Mitzi -Mmarianne -Modesta -Modestia -Modestine -Modesty -Moina -Moira -Moll -Mollee -Molli -Mollie -Molly -Mommy -Mona -Monah -Monica -Monika -Monique -Mora -Moreen -Morena -Morgan -Morgana -Morganica -Morganne -Morgen -Moria -Morissa -Morlee -Morna -Moselle -Moya -Moyna -Moyra -Mozelle -Muffin -Mufi -Mufinella -Muire -Mureil -Murial -Muriel -Murielle -Myna -Myra -Myrah -Myranda -Myriam -Myrilla -Myrle -Myrlene -Myrna -Myrta -Myrtia -Myrtice -Myrtie -Myrtle -Nada -Nadean -Nadeen -Nadia -Nadine -Nadiya -Nady -Nadya -Nalani -Nan -Nana -Nananne -Nance -Nancee -Nancey -Nanci -Nancie -Nancy -Nanete -Nanette -Nani -Nanice -Nanine -Nannette -Nanni -Nannie -Nanny -Nanon -Naoma -Naomi -Nara -Nari -Nariko -Nat -Nata -Natala -Natalee -Natalia -Natalie -Natalina -Nataline -Natalya -Natasha -Natassia -Nathalia -Nathalie -Natka -Natty -Neala -Neda -Nedda -Nedi -Neely -Neila -Neile -Neilla -Neille -Nela -Nelia -Nelie -Nell -Nelle -Nelli -Nellie -Nelly -Nena -Nerissa -Nerita -Nert -Nerta -Nerte -Nerti -Nertie -Nerty -Nessa -Nessi -Nessie -Nessy -Nesta -Netta -Netti -Nettie -Nettle -Netty -Nevsa -Neysa -Nichol -Nichole -Nicholle -Nicki -Nickie -Nicky -Nicol -Nicola -Nicole -Nicolea -Nicolette -Nicoli -Nicolina -Nicoline -Nicolle -Nidia -Nike -Niki -Nikki -Nikkie -Nikoletta -Nikolia -Nil -Nina -Ninetta -Ninette -Ninnetta -Ninnette -Ninon -Nisa -Nissa -Nisse -Nissie -Nissy -Nita -Nitin -Nixie -Noami -Noel -Noelani -Noell -Noella -Noelle -Noellyn -Noelyn -Noemi -Nola -Nolana -Nolie -Nollie -Nomi -Nona -Nonah -Noni -Nonie -Nonna -Nonnah -Nora -Norah -Norean -Noreen -Norene -Norina -Norine -Norma -Norri -Norrie -Norry -Nova -Novelia -Nydia -Nyssa -Octavia -Odele -Odelia -Odelinda -Odella -Odelle -Odessa -Odetta -Odette -Odilia -Odille -Ofelia -Ofella -Ofilia -Ola -Olenka -Olga -Olia -Olimpia -Olive -Olivette -Olivia -Olivie -Oliy -Ollie -Olly -Olva -Olwen -Olympe -Olympia -Olympie -Ondrea -Oneida -Onida -Onlea -Oona -Opal -Opalina -Opaline -Ophelia -Ophelie -Oprah -Ora -Oralee -Oralia -Oralie -Oralla -Oralle -Orel -Orelee -Orelia -Orelie -Orella -Orelle -Oreste -Oriana -Orly -Orsa -Orsola -Ortensia -Otha -Othelia -Othella -Othilia -Othilie -Ottilie 
-Pacifica -Page -Paige -Paloma -Pam -Pamela -Pamelina -Pamella -Pammi -Pammie -Pammy -Pandora -Pansie -Pansy -Paola -Paolina -Parwane -Pat -Patience -Patrica -Patrice -Patricia -Patrizia -Patsy -Patti -Pattie -Patty -Paula -Paula-Grace -Paule -Pauletta -Paulette -Pauli -Paulie -Paulina -Pauline -Paulita -Pauly -Pavia -Pavla -Pearl -Pearla -Pearle -Pearline -Peg -Pegeen -Peggi -Peggie -Peggy -Pen -Penelopa -Penelope -Penni -Pennie -Penny -Pepi -Pepita -Peri -Peria -Perl -Perla -Perle -Perri -Perrine -Perry -Persis -Pet -Peta -Petra -Petrina -Petronella -Petronia -Petronilla -Petronille -Petunia -Phaedra -Phaidra -Phebe -Phedra -Phelia -Phil -Philipa -Philippa -Philippe -Philippine -Philis -Phillida -Phillie -Phillis -Philly -Philomena -Phoebe -Phylis -Phyllida -Phyllis -Phyllys -Phylys -Pia -Pier -Pierette -Pierrette -Pietra -Piper -Pippa -Pippy -Polly -Pollyanna -Pooh -Poppy -Portia -Pris -Prisca -Priscella -Priscilla -Prissie -Pru -Prudence -Prudi -Prudy -Prue -Prunella -Queada -Queenie -Quentin -Querida -Quinn -Quinta -Quintana -Quintilla -Quintina -Rachael -Rachel -Rachele -Rachelle -Rae -Raf -Rafa -Rafaela -Rafaelia -Rafaelita -Ragnhild -Rahal -Rahel -Raina -Raine -Rakel -Ralina -Ramona -Ramonda -Rana -Randa -Randee -Randene -Randi -Randie -Randy -Ranee -Rani -Rania -Ranice -Ranique -Ranna -Raphaela -Raquel -Raquela -Rasia -Rasla -Raven -Ray -Raychel -Raye -Rayna -Raynell -Rayshell -Rea -Reba -Rebbecca -Rebe -Rebeca -Rebecca -Rebecka -Rebeka -Rebekah -Rebekkah -Ree -Reeba -Reena -Reeta -Reeva -Regan -Reggi -Reggie -Regina -Regine -Reiko -Reina -Reine -Remy -Rena -Renae -Renata -Renate -Rene -Renee -Renel -Renell -Renelle -Renie -Rennie -Reta -Retha -Revkah -Rey -Reyna -Rhea -Rheba -Rheta -Rhetta -Rhiamon -Rhianna -Rhianon -Rhoda -Rhodia -Rhodie -Rhody -Rhona -Rhonda -Riane -Riannon -Rianon -Rica -Ricca -Rici -Ricki -Rickie -Ricky -Riki -Rikki -Rina -Risa -Rissa -Rita -Riva -Rivalee -Rivi -Rivkah -Rivy -Roana -Roanna -Roanne -Robbi -Robbie -Robbin -Robby -Robbyn -Robena -Robenia -Roberta -Robin -Robina -Robinet -Robinett -Robinetta -Robinette -Robinia -Roby -Robyn -Roch -Rochell -Rochella -Rochelle -Rochette -Roda -Rodi -Rodie -Rodina -Romola -Romona -Romonda -Romy -Rona -Ronalda -Ronda -Ronica -Ronna -Ronni -Ronnica -Ronnie -Ronny -Roobbie -Rora -Rori -Rorie -Rory -Ros -Rosa -Rosabel -Rosabella -Rosabelle -Rosaleen -Rosalia -Rosalie -Rosalind -Rosalinda -Rosalinde -Rosaline -Rosalyn -Rosalynd -Rosamond -Rosamund -Rosana -Rosanna -Rosanne -Rosario -Rose -Roseann -Roseanna -Roseanne -Roselia -Roselin -Roseline -Rosella -Roselle -Roselyn -Rosemaria -Rosemarie -Rosemary -Rosemonde -Rosene -Rosetta -Rosette -Roshelle -Rosie -Rosina -Rosita -Roslyn -Rosmunda -Rosy -Row -Rowe -Rowena -Roxana -Roxane -Roxanna -Roxanne -Roxi -Roxie -Roxine -Roxy -Roz -Rozalie -Rozalin -Rozamond -Rozanna -Rozanne -Roze -Rozele -Rozella -Rozelle -Rozina -Rubetta -Rubi -Rubia -Rubie -Rubina -Ruby -Ruella -Ruperta -Ruth -Ruthann -Ruthanne -Ruthe -Ruthi -Ruthie -Ruthy -Ryann -Rycca -Saba -Sabina -Sabine -Sabra -Sabrina -Sacha -Sada -Sadella -Sadie -Sal -Sallee -Salli -Sallie -Sally -Sallyann -Sallyanne -Salome -Sam -Samantha -Samara -Samaria -Sammy -Samuela -Samuella -Sande -Sandi -Sandie -Sandra -Sandy -Sandye -Sapphira -Sapphire -Sara -Sara-Ann -Saraann -Sarah -Sarajane -Saree -Sarena -Sarene -Sarette -Sari -Sarina -Sarine -Sarita -Sascha -Sasha -Sashenka -Saudra -Saundra -Savina -Sayre -Scarlet -Scarlett -Scotty -Sean -Seana -Secunda -Seka -Sela -Selena -Selene -Selestina -Selia -Selie -Selina -Selinda -Seline 
-Sella -Selle -Selma -Sena -Sephira -Serena -Serene -Shaina -Shaine -Shalna -Shalne -Shamit -Shana -Shanda -Shandee -Shandie -Shandra -Shandy -Shane -Shani -Shanie -Shanna -Shannah -Shannen -Shannon -Shanon -Shanta -Shantee -Shara -Sharai -Shari -Sharia -Sharie -Sharity -Sharl -Sharla -Sharleen -Sharlene -Sharline -Sharna -Sharon -Sharona -Sharra -Sharron -Sharyl -Shaun -Shauna -Shawn -Shawna -Shawnee -Shay -Shayla -Shaylah -Shaylyn -Shaylynn -Shayna -Shayne -Shea -Sheba -Sheela -Sheelagh -Sheelah -Sheena -Sheeree -Sheila -Sheila-Kathryn -Sheilah -Sheilakathryn -Shel -Shela -Shelagh -Shelba -Shelbi -Shelby -Shelia -Shell -Shelley -Shelli -Shellie -Shelly -Shena -Sher -Sheree -Sheri -Sherie -Sheril -Sherill -Sherilyn -Sherline -Sherri -Sherrie -Sherry -Sherye -Sheryl -Shilpa -Shina -Shir -Shira -Shirah -Shirl -Shirlee -Shirleen -Shirlene -Shirley -Shirline -Shoshana -Shoshanna -Shoshie -Siana -Sianna -Sib -Sibbie -Sibby -Sibeal -Sibel -Sibella -Sibelle -Sibilla -Sibley -Sibyl -Sibylla -Sibylle -Sidoney -Sidonia -Sidonnie -Sigrid -Sile -Sileas -Silva -Silvana -Silvia -Silvie -Simona -Simone -Simonette -Simonne -Sindee -Sinead -Siobhan -Sioux -Siouxie -Sisely -Sisile -Sissie -Sissy -Sofia -Sofie -Solange -Sondra -Sonia -Sonja -Sonni -Sonnie -Sonnnie -Sonny -Sonya -Sophey -Sophi -Sophia -Sophie -Sophronia -Sorcha -Sosanna -Stace -Stacee -Stacey -Staci -Stacia -Stacie -Stacy -Stafani -Star -Starla -Starlene -Starlin -Starr -Stefa -Stefania -Stefanie -Steffane -Steffi -Steffie -Stella -Stepha -Stephana -Stephani -Stephanie -Stephannie -Stephenie -Stephi -Stephie -Stephine -Stesha -Stevana -Stevena -Stoddard -Storey -Storm -Stormi -Stormie -Stormy -Sue -Sue-elle -Suellen -Sukey -Suki -Sula -Sunny -Sunshine -Susan -Susana -Susanetta -Susann -Susanna -Susannah -Susanne -Susette -Susi -Susie -Sussi -Susy -Suzan -Suzann -Suzanna -Suzanne -Suzetta -Suzette -Suzi -Suzie -Suzy -Suzzy -Sybil -Sybila -Sybilla -Sybille -Sybyl -Sydel -Sydelle -Sydney -Sylvia -Sylvie -Tabatha -Tabbatha -Tabbi -Tabbie -Tabbitha -Tabby -Tabina -Tabitha -Taffy -Talia -Tallia -Tallie -Tally -Talya -Talyah -Tamar -Tamara -Tamarah -Tamarra -Tamera -Tami -Tamiko -Tamma -Tammara -Tammi -Tammie -Tammy -Tamra -Tana -Tandi -Tandie -Tandy -Tani -Tania -Tansy -Tanya -Tara -Tarah -Tarra -Tarrah -Taryn -Tasha -Tasia -Tate -Tatiana -Tatiania -Tatum -Tawnya -Tawsha -Teane -Ted -Tedda -Teddi -Teddie -Teddy -Tedi -Tedra -Teena -Tella -Teodora -Tera -Teresa -TeresaAnne -Terese -Teresina -Teresita -Teressa -Teri -Teriann -Terina -Terra -Terri -Terri-Jo -Terrianne -Terrie -Terry -Terrye -Tersina -Teryl -Terza -Tess -Tessa -Tessi -Tessie -Tessy -Thalia -Thea -Theada -Theadora -Theda -Thekla -Thelma -Theo -Theodora -Theodosia -Theresa -Theresa-Marie -Therese -Theresina -Theresita -Theressa -Therine -Thia -Thomasa -Thomasin -Thomasina -Thomasine -Tia -Tiana -Tiena -Tierney -Tiertza -Tiff -Tiffani -Tiffanie -Tiffany -Tiffi -Tiffie -Tiffy -Tilda -Tildi -Tildie -Tildy -Tillie -Tilly -Tim -Timi -Timmi -Timmie -Timmy -Timothea -Tina -Tine -Tiphani -Tiphanie -Tiphany -Tish -Tisha -Tobe -Tobey -Tobi -Tobie -Toby -Tobye -Toinette -Toma -Tomasina -Tomasine -Tomi -Tomiko -Tommi -Tommie -Tommy -Toni -Tonia -Tonie -Tony -Tonya -Tootsie -Torey -Tori -Torie -Torrie -Tory -Tova -Tove -Trace -Tracee -Tracey -Traci -Tracie -Tracy -Trenna -Tresa -Trescha -Tressa -Tricia -Trina -Trish -Trisha -Trista -Trix -Trixi -Trixie -Trixy -Truda -Trude -Trudey -Trudi -Trudie -Trudy -Trula -Tuesday -Twila -Twyla -Tybi -Tybie -Tyne -Ula -Ulla -Ulrica -Ulrika -Ulrike -Umeko -Una 
-Ursa -Ursala -Ursola -Ursula -Ursulina -Ursuline -Uta -Val -Valaree -Valaria -Vale -Valeda -Valencia -Valene -Valenka -Valentia -Valentina -Valentine -Valera -Valeria -Valerie -Valery -Valerye -Valida -Valina -Valli -Vallie -Vally -Valma -Valry -Van -Vanda -Vanessa -Vania -Vanna -Vanni -Vannie -Vanny -Vanya -Veda -Velma -Velvet -Vena -Venita -Ventura -Venus -Vera -Veradis -Vere -Verena -Verene -Veriee -Verile -Verina -Verine -Verla -Verna -Vernice -Veronica -Veronika -Veronike -Veronique -Vi -Vicki -Vickie -Vicky -Victoria -Vida -Viki -Vikki -Vikkie -Vikky -Vilhelmina -Vilma -Vin -Vina -Vinita -Vinni -Vinnie -Vinny -Viola -Violante -Viole -Violet -Violetta -Violette -Virgie -Virgina -Virginia -Virginie -Vita -Vitia -Vitoria -Vittoria -Viv -Viva -Vivi -Vivia -Vivian -Viviana -Vivianna -Vivianne -Vivie -Vivien -Viviene -Vivienne -Viviyan -Vivyan -Vivyanne -Vonni -Vonnie -Vonny -Wallie -Wallis -Wally -Waly -Wanda -Wandie -Wandis -Waneta -Wenda -Wendeline -Wendi -Wendie -Wendy -Wenona -Wenonah -Whitney -Wileen -Wilhelmina -Wilhelmine -Wilie -Willa -Willabella -Willamina -Willetta -Willette -Willi -Willie -Willow -Willy -Willyt -Wilma -Wilmette -Wilona -Wilone -Wilow -Windy -Wini -Winifred -Winna -Winnah -Winne -Winni -Winnie -Winnifred -Winny -Winona -Winonah -Wren -Wrennie -Wylma -Wynn -Wynne -Wynnie -Wynny -Xaviera -Xena -Xenia -Xylia -Xylina -Yalonda -Yehudit -Yelena -Yetta -Yettie -Yetty -Yevette -Yoko -Yolanda -Yolande -Yolane -Yolanthe -Yonina -Yoshi -Yoshiko -Yovonnda -Yvette -Yvonne -Zabrina -Zahara -Zandra -Zaneta -Zara -Zarah -Zaria -Zarla -Zea -Zelda -Zelma -Zena -Zenia -Zia -Zilvia -Zita -Zitella -Zoe -Zola -Zonda -Zondra -Zonnya -Zora -Zorah -Zorana -Zorina -Zorine -Zsa Zsa -Zsazsa -Zulema -Zuzana diff --git a/exercises/02450Toolbox_Python/Data/iris.csv b/exercises/02450Toolbox_Python/Data/iris.csv deleted file mode 100644 index f984b2bf7779ea99e49697b02ae92f533c448ba6..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/iris.csv +++ /dev/null @@ -1,151 +0,0 @@ -"Sepal Length","Sepal Width","Petal Length","Petal Width","Type" -5.1,3.5,1.4,0.2,"Iris-setosa" -4.9,3,1.4,0.2,"Iris-setosa" -4.7,3.2,1.3,0.2,"Iris-setosa" -4.6,3.1,1.5,0.2,"Iris-setosa" -5,3.6,1.4,0.2,"Iris-setosa" -5.4,3.9,1.7,0.4,"Iris-setosa" -4.6,3.4,1.4,0.3,"Iris-setosa" -5,3.4,1.5,0.2,"Iris-setosa" -4.4,2.9,1.4,0.2,"Iris-setosa" -4.9,3.1,1.5,0.1,"Iris-setosa" -5.4,3.7,1.5,0.2,"Iris-setosa" -4.8,3.4,1.6,0.2,"Iris-setosa" -4.8,3,1.4,0.1,"Iris-setosa" -4.3,3,1.1,0.1,"Iris-setosa" -5.8,4,1.2,0.2,"Iris-setosa" -5.7,4.4,1.5,0.4,"Iris-setosa" -5.4,3.9,1.3,0.4,"Iris-setosa" -5.1,3.5,1.4,0.3,"Iris-setosa" -5.7,3.8,1.7,0.3,"Iris-setosa" -5.1,3.8,1.5,0.3,"Iris-setosa" -5.4,3.4,1.7,0.2,"Iris-setosa" -5.1,3.7,1.5,0.4,"Iris-setosa" -4.6,3.6,1,0.2,"Iris-setosa" -5.1,3.3,1.7,0.5,"Iris-setosa" -4.8,3.4,1.9,0.2,"Iris-setosa" -5,3,1.6,0.2,"Iris-setosa" -5,3.4,1.6,0.4,"Iris-setosa" -5.2,3.5,1.5,0.2,"Iris-setosa" -5.2,3.4,1.4,0.2,"Iris-setosa" -4.7,3.2,1.6,0.2,"Iris-setosa" -4.8,3.1,1.6,0.2,"Iris-setosa" -5.4,3.4,1.5,0.4,"Iris-setosa" -5.2,4.1,1.5,0.1,"Iris-setosa" -5.5,4.2,1.4,0.2,"Iris-setosa" -4.9,3.1,1.5,0.1,"Iris-setosa" -5,3.2,1.2,0.2,"Iris-setosa" -5.5,3.5,1.3,0.2,"Iris-setosa" -4.9,3.1,1.5,0.1,"Iris-setosa" -4.4,3,1.3,0.2,"Iris-setosa" -5.1,3.4,1.5,0.2,"Iris-setosa" -5,3.5,1.3,0.3,"Iris-setosa" -4.5,2.3,1.3,0.3,"Iris-setosa" -4.4,3.2,1.3,0.2,"Iris-setosa" -5,3.5,1.6,0.6,"Iris-setosa" -5.1,3.8,1.9,0.4,"Iris-setosa" -4.8,3,1.4,0.3,"Iris-setosa" -5.1,3.8,1.6,0.2,"Iris-setosa" 
-4.6,3.2,1.4,0.2,"Iris-setosa" -5.3,3.7,1.5,0.2,"Iris-setosa" -5,3.3,1.4,0.2,"Iris-setosa" -7,3.2,4.7,1.4,"Iris-versicolor" -6.4,3.2,4.5,1.5,"Iris-versicolor" -6.9,3.1,4.9,1.5,"Iris-versicolor" -5.5,2.3,4,1.3,"Iris-versicolor" -6.5,2.8,4.6,1.5,"Iris-versicolor" -5.7,2.8,4.5,1.3,"Iris-versicolor" -6.3,3.3,4.7,1.6,"Iris-versicolor" -4.9,2.4,3.3,1,"Iris-versicolor" -6.6,2.9,4.6,1.3,"Iris-versicolor" -5.2,2.7,3.9,1.4,"Iris-versicolor" -5,2,3.5,1,"Iris-versicolor" -5.9,3,4.2,1.5,"Iris-versicolor" -6,2.2,4,1,"Iris-versicolor" -6.1,2.9,4.7,1.4,"Iris-versicolor" -5.6,2.9,3.6,1.3,"Iris-versicolor" -6.7,3.1,4.4,1.4,"Iris-versicolor" -5.6,3,4.5,1.5,"Iris-versicolor" -5.8,2.7,4.1,1,"Iris-versicolor" -6.2,2.2,4.5,1.5,"Iris-versicolor" -5.6,2.5,3.9,1.1,"Iris-versicolor" -5.9,3.2,4.8,1.8,"Iris-versicolor" -6.1,2.8,4,1.3,"Iris-versicolor" -6.3,2.5,4.9,1.5,"Iris-versicolor" -6.1,2.8,4.7,1.2,"Iris-versicolor" -6.4,2.9,4.3,1.3,"Iris-versicolor" -6.6,3,4.4,1.4,"Iris-versicolor" -6.8,2.8,4.8,1.4,"Iris-versicolor" -6.7,3,5,1.7,"Iris-versicolor" -6,2.9,4.5,1.5,"Iris-versicolor" -5.7,2.6,3.5,1,"Iris-versicolor" -5.5,2.4,3.8,1.1,"Iris-versicolor" -5.5,2.4,3.7,1,"Iris-versicolor" -5.8,2.7,3.9,1.2,"Iris-versicolor" -6,2.7,5.1,1.6,"Iris-versicolor" -5.4,3,4.5,1.5,"Iris-versicolor" -6,3.4,4.5,1.6,"Iris-versicolor" -6.7,3.1,4.7,1.5,"Iris-versicolor" -6.3,2.3,4.4,1.3,"Iris-versicolor" -5.6,3,4.1,1.3,"Iris-versicolor" -5.5,2.5,4,1.3,"Iris-versicolor" -5.5,2.6,4.4,1.2,"Iris-versicolor" -6.1,3,4.6,1.4,"Iris-versicolor" -5.8,2.6,4,1.2,"Iris-versicolor" -5,2.3,3.3,1,"Iris-versicolor" -5.6,2.7,4.2,1.3,"Iris-versicolor" -5.7,3,4.2,1.2,"Iris-versicolor" -5.7,2.9,4.2,1.3,"Iris-versicolor" -6.2,2.9,4.3,1.3,"Iris-versicolor" -5.1,2.5,3,1.1,"Iris-versicolor" -5.7,2.8,4.1,1.3,"Iris-versicolor" -6.3,3.3,6,2.5,"Iris-virginica" -5.8,2.7,5.1,1.9,"Iris-virginica" -7.1,3,5.9,2.1,"Iris-virginica" -6.3,2.9,5.6,1.8,"Iris-virginica" -6.5,3,5.8,2.2,"Iris-virginica" -7.6,3,6.6,2.1,"Iris-virginica" -4.9,2.5,4.5,1.7,"Iris-virginica" -7.3,2.9,6.3,1.8,"Iris-virginica" -6.7,2.5,5.8,1.8,"Iris-virginica" -7.2,3.6,6.1,2.5,"Iris-virginica" -6.5,3.2,5.1,2,"Iris-virginica" -6.4,2.7,5.3,1.9,"Iris-virginica" -6.8,3,5.5,2.1,"Iris-virginica" -5.7,2.5,5,2,"Iris-virginica" -5.8,2.8,5.1,2.4,"Iris-virginica" -6.4,3.2,5.3,2.3,"Iris-virginica" -6.5,3,5.5,1.8,"Iris-virginica" -7.7,3.8,6.7,2.2,"Iris-virginica" -7.7,2.6,6.9,2.3,"Iris-virginica" -6,2.2,5,1.5,"Iris-virginica" -6.9,3.2,5.7,2.3,"Iris-virginica" -5.6,2.8,4.9,2,"Iris-virginica" -7.7,2.8,6.7,2,"Iris-virginica" -6.3,2.7,4.9,1.8,"Iris-virginica" -6.7,3.3,5.7,2.1,"Iris-virginica" -7.2,3.2,6,1.8,"Iris-virginica" -6.2,2.8,4.8,1.8,"Iris-virginica" -6.1,3,4.9,1.8,"Iris-virginica" -6.4,2.8,5.6,2.1,"Iris-virginica" -7.2,3,5.8,1.6,"Iris-virginica" -7.4,2.8,6.1,1.9,"Iris-virginica" -7.9,3.8,6.4,2,"Iris-virginica" -6.4,2.8,5.6,2.2,"Iris-virginica" -6.3,2.8,5.1,1.5,"Iris-virginica" -6.1,2.6,5.6,1.4,"Iris-virginica" -7.7,3,6.1,2.3,"Iris-virginica" -6.3,3.4,5.6,2.4,"Iris-virginica" -6.4,3.1,5.5,1.8,"Iris-virginica" -6,3,4.8,1.8,"Iris-virginica" -6.9,3.1,5.4,2.1,"Iris-virginica" -6.7,3.1,5.6,2.4,"Iris-virginica" -6.9,3.1,5.1,2.3,"Iris-virginica" -5.8,2.7,5.1,1.9,"Iris-virginica" -6.8,3.2,5.9,2.3,"Iris-virginica" -6.7,3.3,5.7,2.5,"Iris-virginica" -6.7,3,5.2,2.3,"Iris-virginica" -6.3,2.5,5,1.9,"Iris-virginica" -6.5,3,5.2,2,"Iris-virginica" -6.2,3.4,5.4,2.3,"Iris-virginica" -5.9,3,5.1,1.8,"Iris-virginica" diff --git a/exercises/02450Toolbox_Python/Data/iris.mat b/exercises/02450Toolbox_Python/Data/iris.mat deleted file mode 
100644 index df9348acff144a45cf7a7dd899395741201c3829..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/iris.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/iris.xls b/exercises/02450Toolbox_Python/Data/iris.xls deleted file mode 100644 index 0a3950140b342d9ee3de3bb285be5f2bb0625953..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/iris.xls and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/male.txt b/exercises/02450Toolbox_Python/Data/male.txt deleted file mode 100644 index bacce977f4f80fe6df231891486daa62acaeded5..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/male.txt +++ /dev/null @@ -1,2943 +0,0 @@ -Aamir -Aaron -Abbey -Abbie -Abbot -Abbott -Abby -Abdel -Abdul -Abdulkarim -Abdullah -Abe -Abel -Abelard -Abner -Abraham -Abram -Ace -Adair -Adam -Adams -Addie -Adger -Aditya -Adlai -Adnan -Adolf -Adolfo -Adolph -Adolphe -Adolpho -Adolphus -Adrian -Adrick -Adrien -Agamemnon -Aguinaldo -Aguste -Agustin -Aharon -Ahmad -Ahmed -Ahmet -Ajai -Ajay -Al -Alaa -Alain -Alan -Alasdair -Alastair -Albatros -Albert -Alberto -Albrecht -Alden -Aldis -Aldo -Aldric -Aldrich -Aldus -Aldwin -Alec -Aleck -Alejandro -Aleks -Aleksandrs -Alessandro -Alex -Alexander -Alexei -Alexis -Alf -Alfie -Alfonse -Alfonso -Alfonzo -Alford -Alfred -Alfredo -Algernon -Ali -Alic -Alister -Alix -Allah -Allan -Allen -Alley -Allie -Allin -Allyn -Alonso -Alonzo -Aloysius -Alphonse -Alphonso -Alston -Alton -Alvin -Alwin -Amadeus -Ambros -Ambrose -Ambrosi -Ambrosio -Ambrosius -Amery -Amory -Amos -Anatol -Anatole -Anatollo -Anatoly -Anders -Andie -Andonis -Andre -Andrea -Andreas -Andrej -Andres -Andrew -Andrey -Andri -Andros -Andrus -Andrzej -Andy -Angel -Angelico -Angelo -Angie -Angus -Ansel -Ansell -Anselm -Anson -Anthony -Antin -Antoine -Anton -Antone -Antoni -Antonin -Antonino -Antonio -Antonius -Antony -Anurag -Apollo -Apostolos -Aram -Archibald -Archibold -Archie -Archon -Archy -Arel -Ari -Arie -Ariel -Aristotle -Arlo -Armand -Armando -Armond -Armstrong -Arne -Arnie -Arnold -Arnoldo -Aron -Arron -Art -Arther -Arthur -Artie -Artur -Arturo -Arvie -Arvin -Arvind -Arvy -Ash -Ashby -Ashish -Ashley -Ashton -Aub -Aube -Aubert -Aubrey -Augie -August -Augustin -Augustine -Augusto -Augustus -Austen -Austin -Ave -Averell -Averil -Averill -Avery -Avi -Avraham -Avram -Avrom -Axel -Aylmer -Aziz -Bailey -Bailie -Baillie -Baily -Baird -Baldwin -Bancroft -Barbabas -Barclay -Bard -Barde -Barn -Barnabas -Barnabe -Barnaby -Barnard -Barnebas -Barnett -Barney -Barnie -Barny -Baron -Barr -Barret -Barrett -Barri -Barrie -Barris -Barron -Barry -Bart -Bartel -Barth -Barthel -Bartholemy -Bartholomeo -Bartholomeus -Bartholomew -Bartie -Bartlet -Bartlett -Bartolemo -Bartolomei -Bartolomeo -Barton -Barty -Bary -Basil -Batholomew -Baxter -Bay -Bayard -Beale -Bealle -Bear -Bearnard -Beau -Beaufort -Beauregard -Beck -Bela -Ben -Benedict -Bengt -Benito -Benjamen -Benjamin -Benji -Benjie -Benjy -Benn -Bennet -Bennett -Bennie -Benny -Benson -Bentley -Benton -Beowulf -Berchtold -Berk -Berke -Berkeley -Berkie -Berkley -Bernard -Bernardo -Bernd -Bernhard -Bernie -Bert -Bertie -Bertram -Bertrand -Bharat -Biff -Bill -Billie -Billy -Bing -Binky -Bishop -Bjorn -Bjorne -Blaine -Blair -Blake -Blare -Blayne -Bo -Bob -Bobbie -Bobby -Bogart -Bogdan -Boniface -Boris -Boyce -Boyd -Brad -Braden -Bradford -Bradley -Bradly -Brady -Brandon -Brandy -Brant -Brendan -Brent -Bret -Brett -Brewer -Brewster -Brian -Brice -Briggs 
-Brinkley -Britt -Brock -Broddie -Broddy -Broderic -Broderick -Brodie -Brody -Bronson -Brook -Brooke -Brooks -Bruce -Bruno -Bryan -Bryant -Bryce -Bryn -Bryon -Bubba -Buck -Bucky -Bud -Buddy -Burgess -Burke -Burl -Burnaby -Burt -Burton -Buster -Butch -Butler -Byram -Byron -Caesar -Cain -Cal -Caldwell -Caleb -Calhoun -Calvin -Cam -Cameron -Cammy -Carey -Carl -Carleigh -Carlie -Carlin -Carlo -Carlos -Carlton -Carlyle -Carmine -Carroll -Carson -Carsten -Carter -Cary -Caryl -Case -Casey -Caspar -Casper -Cass -Cat -Cecil -Cesar -Chad -Chadd -Chaddie -Chaddy -Chadwick -Chaim -Chalmers -Chan -Chance -Chancey -Chanderjit -Chandler -Chane -Chariot -Charles -Charleton -Charley -Charlie -Charlton -Chas -Chase -Chaunce -Chauncey -Che -Chelton -Chen -Chester -Cheston -Chet -Chev -Chevalier -Chevy -Chip -Chris -Chrissy -Christ -Christian -Christiano -Christie -Christof -Christofer -Christoph -Christophe -Christopher -Christorpher -Christos -Christy -Chrisy -Chuck -Churchill -Clair -Claire -Clancy -Clarance -Clare -Clarence -Clark -Clarke -Claude -Claudio -Claudius -Claus -Clay -Clayborn -Clayborne -Claybourne -Clayton -Cleland -Clem -Clemens -Clement -Clemente -Clemmie -Cletus -Cleveland -Cliff -Clifford -Clifton -Clint -Clinten -Clinton -Clive -Clyde -Cob -Cobb -Cobbie -Cobby -Cody -Colbert -Cole -Coleman -Colin -Collin -Collins -Conan -Connie -Connolly -Connor -Conrad -Conroy -Constantin -Constantine -Constantinos -Conway -Cooper -Corbin -Corby -Corey -Corky -Cornelius -Cornellis -Corrie -Cortese -Corwin -Cory -Cosmo -Costa -Courtney -Craig -Crawford -Creighton -Cris -Cristopher -Curt -Curtice -Curtis -Cy -Cyril -Cyrill -Cyrille -Cyrillus -Cyrus -Dabney -Daffy -Dale -Dallas -Dalton -Damian -Damien -Damon -Dan -Dana -Dane -Dani -Danie -Daniel -Dannie -Danny -Dante -Darby -Darcy -Daren -Darian -Darien -Darin -Dario -Darius -Darrel -Darrell -Darren -Darrick -Darrin -Darryl -Darth -Darwin -Daryl -Daryle -Dave -Davey -David -Davidde -Davide -Davidson -Davie -Davin -Davis -Davon -Davoud -Davy -Dawson -Dean -Deane -Del -Delbert -Dell -Delmar -Demetre -Demetri -Demetris -Demetrius -Demosthenis -Denis -Dennie -Dennis -Denny -Derby -Derek -Derick -Derk -Derrek -Derrick -Derrin -Derrol -Derron -Deryl -Desmond -Desmund -Devin -Devon -Dewey -Dewitt -Dexter -Dick -Dickey -Dickie -Diego -Dieter -Dietrich -Dillon -Dimitri -Dimitrios -Dimitris -Dimitrou -Dimitry -Dino -Dion -Dionis -Dionysus -Dirk -Dmitri -Dom -Domenic -Domenico -Dominic -Dominick -Dominique -Don -Donal -Donald -Donn -Donnie -Donny -Donovan -Dorian -Dory -Doug -Douggie -Dougie -Douglas -Douglass -Douglis -Dov -Doyle -Drake -Drew -Dru -Dryke -Duane -Dudley -Duffie -Duffy -Dugan -Duke -Dunc -Duncan -Dunstan -Durand -Durant -Durante -Durward -Dustin -Dwain -Dwaine -Dwane -Dwayne -Dwight -Dylan -Dyson -Earl -Earle -Easton -Eben -Ebeneser -Ebenezer -Eberhard -Ed -Eddie -Eddy -Edgar -Edgardo -Edie -Edmond -Edmund -Edouard -Edsel -Eduard -Eduardo -Edward -Edwin -Efram -Egbert -Ehud -Elbert -Elden -Eldon -Eli -Elias -Elihu -Elijah -Eliot -Eliott -Elisha -Elliot -Elliott -Ellis -Ellsworth -Ellwood -Elmer -Elmore -Elnar -Elric -Elroy -Elton -Elvin -Elvis -Elwin -Elwood -Elwyn -Ely -Emanuel -Emerson -Emery -Emil -Emile -Emilio -Emmanuel -Emmery -Emmet -Emmett -Emmit -Emmott -Emmy -Emory -Ender -Engelbart -Engelbert -Englebart -Englebert -Enoch -Enrico -Enrique -Ephraim -Ephram -Ephrayim -Ephrem -Er -Erasmus -Erastus -Erek -Erhard -Erhart -Eric -Erich -Erick -Erik -Erin -Erl -Ernest -Ernesto -Ernie -Ernst -Erny -Errol -Ervin -Erwin -Esau -Esme -Esteban -Ethan 
-Ethelbert -Ethelred -Etienne -Euclid -Eugen -Eugene -Eustace -Ev -Evan -Evelyn -Everard -Everett -Ewan -Ewart -Ez -Ezechiel -Ezekiel -Ezra -Fabian -Fabio -Fairfax -Farley -Fazeel -Federico -Felice -Felicio -Felipe -Felix -Ferd -Ferdie -Ferdinand -Ferdy -Fergus -Ferguson -Ferinand -Fernando -Fidel -Filbert -Filip -Filipe -Filmore -Finley -Finn -Fitz -Fitzgerald -Flem -Fleming -Flemming -Fletch -Fletcher -Flin -Flinn -Flint -Flipper -Florian -Floyd -Flynn -Fons -Fonsie -Fonz -Fonzie -Forbes -Ford -Forest -Forester -Forrest -Forrester -Forster -Foster -Fowler -Fox -Fran -Francesco -Francis -Francisco -Francois -Frank -Frankie -Franklin -Franklyn -Franky -Frans -Franz -Fraser -Frazier -Fred -Freddie -Freddy -Frederic -Frederich -Frederick -Frederico -Frederik -Fredric -Fredrick -Freeman -Freemon -Fremont -French -Friedric -Friedrich -Friedrick -Fritz -Fulton -Fyodor -Gabe -Gabriel -Gabriele -Gabriell -Gabriello -Gail -Gale -Galen -Gallagher -Gamaliel -Garcia -Garcon -Gardener -Gardiner -Gardner -Garey -Garfield -Garfinkel -Garold -Garp -Garret -Garrett -Garrot -Garrott -Garry -Garth -Garv -Garvey -Garvin -Garvy -Garwin -Garwood -Gary -Gaspar -Gasper -Gaston -Gav -Gaven -Gavin -Gavriel -Gay -Gayle -Gearard -Gene -Geo -Geof -Geoff -Geoffrey -Geoffry -Georg -George -Georges -Georgia -Georgie -Georgy -Gerald -Geraldo -Gerard -Gere -Gerhard -Gerhardt -Geri -Germaine -Gerold -Gerome -Gerrard -Gerri -Gerrit -Gerry -Gershom -Gershon -Giacomo -Gian -Giancarlo -Giavani -Gibb -Gideon -Giff -Giffard -Giffer -Giffie -Gifford -Giffy -Gil -Gilbert -Gilberto -Gilburt -Giles -Gill -Gilles -Ginger -Gino -Giordano -Giorgi -Giorgio -Giovanne -Giovanni -Giraldo -Giraud -Giuseppe -Glen -Glenn -Glynn -Godard -Godart -Goddard -Goddart -Godfree -Godfrey -Godfry -Godwin -Gomer -Gonzales -Gonzalo -Goober -Goose -Gordan -Gordie -Gordon -Grace -Grady -Graehme -Graeme -Graham -Graig -Grant -Granville -Greg -Gregg -Greggory -Gregor -Gregorio -Gregory -Gretchen -Griff -Griffin -Griffith -Griswold -Grove -Grover -Guido -Guillaume -Guillermo -Gunner -Gunter -Gunther -Gus -Gustaf -Gustav -Gustave -Gustavo -Gustavus -Guthrey -Guthrie -Guthry -Guy -Hadleigh -Hadley -Hadrian -Hagan -Hagen -Hailey -Hakeem -Hakim -Hal -Hale -Haleigh -Haley -Hall -Hallam -Halvard -Ham -Hamel -Hamid -Hamil -Hamilton -Hamish -Hamlen -Hamlet -Hamlin -Hammad -Hamnet -Han -Hanan -Hanford -Hank -Hannibal -Hans -Hans-Peter -Hansel -Hanson -Harald -Harcourt -Hari -Harlan -Harland -Harley -Harlin -Harman -Harmon -Harold -Harris -Harrison -Harrold -Harry -Hart -Hartley -Hartwell -Harv -Harvard -Harvey -Harvie -Harwell -Hasheem -Hashim -Haskel -Haskell -Hassan -Hastings -Hasty -Haven -Hayden -Haydon -Hayes -Hayward -Haywood -Hazel -Heath -Heathcliff -Hebert -Hector -Heinrich -Heinz -Helmuth -Henderson -Hendrick -Hendrik -Henri -Henrie -Henrik -Henrique -Henry -Herb -Herbert -Herbie -Herby -Hercule -Hercules -Herculie -Herman -Hermann -Hermon -Hermy -Hernando -Herold -Herrick -Herrmann -Hersch -Herschel -Hersh -Hershel -Herve -Hervey -Hew -Hewe -Hewet -Hewett -Hewie -Hewitt -Heywood -Hezekiah -Higgins -Hilary -Hilbert -Hill -Hillard -Hillary -Hillel -Hillery -Hilliard -Hilton -Hiralal -Hiram -Hiro -Hirsch -Hobart -Hodge -Hogan -Hollis -Holly -Homer -Horace -Horacio -Horatio -Horatius -Horst -Howard -Howie -Hoyt -Hubert -Hudson -Huey -Hugh -Hugo -Humbert -Humphrey -Hunt -Hunter -Huntington -Huntlee -Huntley -Hurley -Husain -Husein -Hussein -Hy -Hyatt -Hyman -Hymie -Iago -Iain -Ian -Ibrahim -Ichabod -Iggie -Iggy -Ignace -Ignacio -Ignacius -Ignatius -Ignaz 
-Ignazio -Igor -Ike -Ikey -Immanuel -Ingamar -Ingelbert -Ingemar -Inglebert -Ingmar -Ingram -Inigo -Ira -Irvin -Irvine -Irving -Irwin -Isa -Isaac -Isaak -Isador -Isadore -Isaiah -Ishmael -Isidore -Ismail -Israel -Istvan -Ivan -Ivor -Izaak -Izak -Izzy -Jabez -Jack -Jackie -Jackson -Jacob -Jacques -Jae -Jaime -Jake -Jakob -James -Jameson -Jamey -Jamie -Jan -Janos -Janus -Jared -Jarrett -Jarvis -Jason -Jasper -Javier -Jay -Jean -Jean-Christophe -Jean-Francois -Jean-Lou -Jean-Luc -Jean-Marc -Jean-Paul -Jean-Pierre -Jeb -Jed -Jedediah -Jef -Jeff -Jefferey -Jefferson -Jeffery -Jeffie -Jeffrey -Jeffry -Jefry -Jehu -Jennings -Jens -Jephthah -Jerald -Jeramie -Jere -Jereme -Jeremiah -Jeremias -Jeremie -Jeremy -Jermain -Jermaine -Jermayne -Jerold -Jerome -Jeromy -Jerri -Jerrie -Jerrold -Jerrome -Jerry -Jervis -Jerzy -Jess -Jesse -Jessee -Jessey -Jessie -Jesus -Jeth -Jethro -Jim -Jimbo -Jimmie -Jimmy -Jo -Joab -Joachim -Joao -Joaquin -Job -Jock -Jodi -Jodie -Jody -Joe -Joel -Joey -Johan -Johann -Johannes -John -John-David -John-Patrick -Johnathan -Johnathon -Johnnie -Johnny -Johny -Jon -Jonah -Jonas -Jonathan -Jonathon -Jonny -Jordan -Jordon -Jordy -Jorge -Jory -Jose -Josef -Joseph -Josephus -Josh -Joshua -Joshuah -Josiah -Jotham -Juan -Juanita -Jud -Judah -Judas -Judd -Jude -Judith -Judson -Judy -Juergen -Jule -Jules -Julian -Julie -Julio -Julius -Justin -Justis -Kaiser -Kaleb -Kalil -Kalle -Kalman -Kalvin -Kam -Kane -Kareem -Karel -Karim -Karl -Karsten -Kaspar -Keefe -Keenan -Keene -Keil -Keith -Kellen -Kelley -Kelly -Kelsey -Kelvin -Kelwin -Ken -Kendal -Kendall -Kendrick -Kenn -Kennedy -Kenneth -Kenny -Kent -Kenton -Kenyon -Kermie -Kermit -Kerry -Kevan -Kevin -Kim -Kimball -Kimmo -Kin -Kincaid -King -Kingsley -Kingsly -Kingston -Kip -Kirby -Kirk -Kit -Klaus -Klee -Knox -Konrad -Konstantin -Kory -Kostas -Kraig -Kris -Krishna -Kristian -Kristopher -Kristos -Kurt -Kurtis -Kyle -Laird -Lamar -Lambert -Lamont -Lance -Lancelot -Lane -Langston -Lanny -Larry -Lars -Laurance -Lauren -Laurence -Laurens -Laurent -Laurie -Lawerence -Lawrence -Lawson -Lawton -Lay -Layton -Lazar -Lazare -Lazaro -Lazarus -Lazlo -Lee -Lefty -Leif -Leigh -Leighton -Leland -Lem -Lemar -Lemmie -Lemmy -Lemuel -Len -Lenard -Lennie -Lenny -Leo -Leon -Leonard -Leonardo -Leonerd -Leonhard -Leonid -Leonidas -Leopold -Leroy -Les -Lesley -Leslie -Lester -Lev -Levi -Levin -Levon -Levy -Lew -Lewis -Lex -Liam -Lin -Lincoln -Lind -Lindsay -Lindsey -Lindy -Linoel -Linus -Lion -Lionel -Lionello -Llewellyn -Lloyd -Locke -Lockwood -Logan -Lon -Lonnie -Lonny -Loren -Lorenzo -Lorne -Lorrie -Lothar -Lou -Louie -Louis -Lovell -Lowell -Lucas -Luce -Lucian -Luciano -Lucien -Lucio -Lucius -Ludvig -Ludwig -Luigi -Luis -Lukas -Luke -Luther -Lyle -Lyn -Lyndon -Lynn -Mac -Mace -Mack -Mackenzie -Maddie -Maddy -Madison -Magnum -Magnus -Mahesh -Mahmoud -Mahmud -Maison -Major -Malcolm -Manfred -Manish -Manny -Manuel -Marc -Marcel -Marcello -Marcellus -Marcelo -Marchall -Marcio -Marco -Marcos -Marcus -Marietta -Marilu -Mario -Marion -Marius -Mark -Marko -Markos -Markus -Marlin -Marlo -Marlon -Marlow -Marlowe -Marmaduke -Marsh -Marshal -Marshall -Mart -Martainn -Marten -Martie -Martin -Martino -Marty -Martyn -Marv -Marve -Marven -Marvin -Marwin -Mason -Mateo -Mathew -Mathias -Matias -Matt -Matteo -Matthaeus -Mattheus -Matthew -Matthias -Matthieu -Matthiew -Matthus -Mattias -Mattie -Matty -Maurice -Mauricio -Maurie -Maurise -Maurits -Mauritz -Maury -Max -Maxfield -Maxie -Maxim -Maximilian -Maximilien -Maxwell -Mayer -Maynard -Maynord -Mayor -Mead -Meade -Meier -Meir 
-Mel -Melvin -Melvyn -Menard -Mendel -Mendie -Meredeth -Meredith -Merell -Merill -Merle -Merlin -Merrel -Merrick -Merril -Merrill -Merry -Merv -Mervin -Merwin -Meryl -Meyer -Mic -Micah -Michael -Michail -Michal -Michale -Micheal -Micheil -Michel -Michele -Mick -Mickey -Mickie -Micky -Miguel -Mika -Mikael -Mike -Mikel -Mikey -Mikhail -Miles -Millicent -Milo -Milt -Milton -Mischa -Mitch -Mitchael -Mitchel -Mitchell -Moe -Mohamad -Mohamed -Mohammad -Mohammed -Mohan -Moise -Moises -Moishe -Monroe -Montague -Monte -Montgomery -Monty -Moore -Mordecai -Morgan -Morlee -Morley -Morly -Morrie -Morris -Morry -Morse -Mort -Morten -Mortie -Mortimer -Morton -Morty -Mose -Moses -Moshe -Moss -Muffin -Mugsy -Muhammad -Munmro -Munroe -Murdoch -Murdock -Murphy -Murray -Mustafa -Myke -Myles -Mylo -Myron -Nahum -Napoleon -Nat -Natale -Nate -Nathan -Nathanael -Nathanial -Nathaniel -Nathanil -Neal -Neale -Neall -Nealon -Nealson -Nealy -Ned -Neddie -Neddy -Neel -Neil -Nels -Nelsen -Nelson -Nero -Neron -Nester -Nestor -Nev -Nevil -Nevile -Neville -Nevin -Nevins -Newton -Niall -Niccolo -Nicholas -Nichole -Nichols -Nick -Nickey -Nickie -Nickolas -Nicky -Nico -Nicolas -Niels -Nigel -Niki -Nikita -Nikki -Nikolai -Nikos -Niles -Nils -Nilson -Niven -Noach -Noah -Noam -Noble -Noe -Noel -Nolan -Noland -Norbert -Norm -Norman -Normand -Normie -Norris -Northrop -Northrup -Norton -Norwood -Nunzio -Obadiah -Obadias -Oberon -Obie -Octavius -Odell -Odie -Odin -Odysseus -Olaf -Olag -Ole -Oleg -Olin -Oliver -Olivier -Olle -Ollie -Omar -Oral -Oran -Orazio -Orbadiah -Oren -Orin -Orion -Orlando -Orren -Orrin -Orson -Orton -Orville -Osbert -Osborn -Osborne -Osbourn -Osbourne -Oscar -Osgood -Osmond -Osmund -Ossie -Oswald -Oswell -Otes -Othello -Otho -Otis -Otto -Owen -Ozzie -Ozzy -Pablo -Pace -Paco -Paddie -Paddy -Padraig -Page -Paige -Pail -Palmer -Paolo -Park -Parke -Parker -Parnell -Parrnell -Parry -Parsifal -Partha -Pascal -Pascale -Pasquale -Pat -Pate -Patel -Paten -Patin -Paton -Patric -Patrice -Patricio -Patrick -Patrik -Patsy -Pattie -Patty -Paul -Paulo -Pavel -Pearce -Pedro -Peirce -Pembroke -Pen -Penn -Pennie -Penny -Penrod -Pepe -Pepillo -Pepito -Perceval -Percival -Percy -Perry -Pete -Peter -Petey -Petr -Peyter -Peyton -Phil -Philbert -Philip -Phillip -Phillipe -Phillipp -Phineas -Phip -Pierce -Pierre -Pierson -Piet -Pieter -Pietro -Piggy -Pincas -Pinchas -Pincus -Piotr -Pip -Plato -Pooh -Porter -Poul -Powell -Praneetf -Prasad -Prasun -Prent -Prentice -Prentiss -Prescott -Preston -Price -Prince -Pryce -Puff -Purcell -Putnam -Pyotr -Quent -Quentin -Quiggly -Quigly -Quigman -Quill -Quillan -Quincey -Quincy -Quinlan -Quinn -Quint -Quintin -Quinton -Quintus -Rab -Rabbi -Rabi -Rad -Radcliffe -Rafael -Rafe -Ragnar -Rahul -Raimund -Rainer -Raj -Rajeev -Raleigh -Ralf -Ralph -Ram -Ramesh -Ramon -Ramsay -Ramsey -Rand -Randal -Randall -Randell -Randi -Randie -Randolf -Randolph -Randy -Ransell -Ransom -Raoul -Raphael -Raul -Ravi -Ravil -Rawley -Ray -Raymond -Raymund -Raymundo -Raynard -Rayner -Raynor -Reagan -Red -Redford -Redmond -Reece -Reed -Rees -Reese -Reg -Regan -Regen -Reggie -Reggis -Reggy -Reginald -Reginauld -Reid -Reilly -Reinhard -Reinhold -Rem -Remington -Remus -Renado -Renaldo -Renard -Renato -Renaud -Renault -Rene -Reube -Reuben -Reuven -Rex -Rey -Reynard -Reynold -Reynolds -Reza -Rhett -Ric -Ricard -Ricardo -Riccardo -Rice -Rich -Richard -Richardo -Richie -Richmond -Richy -Rick -Rickard -Rickey -Ricki -Rickie -Ricky -Rik -Rikki -Riley -Rinaldo -Ripley -Ritch -Ritchie -Roarke -Rob -Robb -Robbert -Robbie -Robert -Roberto 
-Robin -Robinson -Rochester -Rock -Rockwell -Rocky -Rod -Rodd -Roddie -Roddy -Roderic -Roderich -Roderick -Roderigo -Rodge -Rodger -Rodney -Rodolfo -Rodolph -Rodolphe -Rodrick -Rodrigo -Rodrique -Rog -Roger -Rogers -Roice -Roland -Rolando -Rolf -Rolfe -Rolland -Rollin -Rollins -Rollo -Rolph -Romain -Roman -Romeo -Ron -Ronald -Ronen -Roni -Ronnie -Ronny -Roosevelt -Rory -Roscoe -Ross -Roth -Rourke -Rowland -Roy -Royal -Royce -Rube -Ruben -Rubin -Ruby -Rudd -Ruddie -Ruddy -Rudie -Rudiger -Rudolf -Rudolfo -Rudolph -Rudy -Rudyard -Rufe -Rufus -Rupert -Ruperto -Russ -Russel -Russell -Rustie -Rustin -Rusty -Rutger -Rutherford -Rutledge -Rutter -Ryan -Sal -Salem -Salim -Salman -Salmon -Salomo -Salomon -Salomone -Salvador -Salvatore -Salvidor -Sam -Sammie -Sammy -Sampson -Samson -Samuel -Samuele -Sancho -Sander -Sanders -Sanderson -Sandor -Sandro -Sandy -Sanford -Sanson -Sansone -Sarge -Sargent -Sascha -Sasha -Saul -Sauncho -Saunder -Saunders -Saunderson -Saundra -Saw -Sawyer -Sawyere -Sax -Saxe -Saxon -Say -Sayer -Sayers -Sayre -Sayres -Scarface -Schroeder -Schuyler -Scot -Scott -Scotti -Scottie -Scotty -Seamus -Sean -Sebastian -Sebastiano -Sebastien -See -Selby -Selig -Serge -Sergeant -Sergei -Sergent -Sergio -Seth -Seymour -Shadow -Shaine -Shalom -Shamus -Shanan -Shane -Shannan -Shannon -Shaughn -Shaun -Shaw -Shawn -Shay -Shayne -Shea -Sheff -Sheffie -Sheffield -Sheffy -Shelby -Shelden -Sheldon -Shell -Shelley -Shelton -Shem -Shep -Shepard -Shepherd -Sheppard -Shepperd -Sheridan -Sherlock -Sherlocke -Sherman -Sherwin -Sherwood -Sherwynd -Shimon -Shlomo -Sholom -Shorty -Shumeet -Shurlock -Shurlocke -Shurwood -Si -Sibyl -Sid -Siddhartha -Sidnee -Sidney -Siegfried -Siffre -Sig -Sigfrid -Sigfried -Sigmund -Silas -Silvain -Silvan -Silvano -Silvanus -Silvester -Silvio -Sim -Simeon -Simmonds -Simon -Simone -Sinclair -Sinclare -Sivert -Siward -Skell -Skelly -Skip -Skipp -Skipper -Skippie -Skippy -Skipton -Sky -Skye -Skylar -Skyler -Slade -Slim -Sloan -Sloane -Sly -Smith -Smitty -Socrates -Sol -Sollie -Solly -Solomon -Somerset -Son -Sonnie -Sonny -Sparky -Spence -Spencer -Spense -Spenser -Spike -Spiro -Spiros -Spud -Srinivas -Stacy -Staffard -Stafford -Staford -Stan -Standford -Stanfield -Stanford -Stanislaw -Stanleigh -Stanley -Stanly -Stanton -Stanwood -Stavros -Stearn -Stearne -Stefan -Stefano -Steffen -Stephan -Stephanus -Stephen -Sterling -Stern -Sterne -Steve -Steven -Stevie -Stevy -Stew -Steward -Stewart -Stig -Stillman -Stillmann -Sting -Stinky -Stirling -Stu -Stuart -Sturgis -Sullivan -Sully -Sumner -Sunny -Sutherland -Sutton -Sven -Swen -Syd -Sydney -Sylvan -Sylvester -Tab -Tabb -Tabbie -Tabby -Taber -Tabor -Tad -Tadd -Taddeo -Taddeus -Tadeas -Tailor -Tait -Taite -Talbert -Talbot -Tallie -Tally -Tam -Tamas -Tammie -Tammy -Tan -Tann -Tanner -Tanney -Tannie -Tanny -Tarrance -Tarrant -Tarzan -Tate -Taylor -Teador -Ted -Tedd -Teddie -Teddy -Tedie -Tedman -Tedmund -Tedrick -Temp -Temple -Templeton -Teodoor -Teodor -Teodorico -Teodoro -Terence -Terencio -Terrance -Terrel -Terrell -Terrence -Terri -Terrill -Terry -Thacher -Thad -Thaddeus -Thaddius -Thaddus -Thadeus -Thain -Thaine -Thane -Tharen -Thatch -Thatcher -Thaxter -Thayne -Thebault -Thedric -Thedrick -Theo -Theobald -Theodor -Theodore -Theodoric -Theophyllus -Thibaud -Thibaut -Thom -Thomas -Thor -Thorn -Thorndike -Thornie -Thornton -Thorny -Thorpe -Thorstein -Thorsten -Thorvald -Thurstan -Thurston -Tibold -Tiebold -Tiebout -Tiler -Tim -Timmie -Timmy -Timothee -Timotheus -Timothy -Tirrell -Tito -Titos -Titus -Tobe -Tobiah -Tobias -Tobie -Tobin 
-Tobit -Toby -Tod -Todd -Toddie -Toddy -Tom -Tomas -Tome -Tomkin -Tomlin -Tommie -Tommy -Tonnie -Tony -Tore -Torey -Torin -Torr -Torrance -Torre -Torrence -Torrey -Torrin -Torry -Town -Towney -Townie -Townsend -Towny -Trace -Tracey -Tracie -Tracy -Traver -Travers -Travis -Tray -Tre -Tremain -Tremaine -Tremayne -Trent -Trenton -Trev -Trevar -Trever -Trevor -Trey -Trip -Tristan -Troy -Truman -Tuck -Tucker -Tuckie -Tucky -Tudor -Tull -Tulley -Tully -Turner -Ty -Tybalt -Tye -Tyler -Tymon -Tymothy -Tynan -Tyrone -Tyrus -Tyson -Udale -Udall -Udell -Ugo -Ulberto -Uli -Ulick -Ulises -Ulric -Ulrich -Ulrick -Ulysses -Umberto -Upton -Urbain -Urban -Urbano -Urbanus -Uri -Uriah -Uriel -Urson -Vachel -Vaclav -Vail -Val -Valdemar -Vale -Valentin -Valentine -Van -Vance -Vasili -Vasilis -Vasily -Vassili -Vassily -Vaughan -Vaughn -Venkat -Verge -Vergil -Vern -Verne -Vernen -Verney -Vernon -Vernor -Vibhu -Vic -Vick -Victor -Vijay -Vilhelm -Vin -Vince -Vincent -Vincents -Vinnie -Vinny -Vinod -Virge -Virgie -Virgil -Virgilio -Vite -Vito -Vlad -Vladamir -Vladimir -Voltaire -Von -Wade -Wadsworth -Wain -Waine -Wainwright -Wait -Waite -Waiter -Wake -Wakefield -Wald -Waldemar -Walden -Waldo -Waldon -Waleed -Walker -Wallace -Wallache -Wallas -Wallie -Wallis -Wally -Walsh -Walt -Walter -Walther -Walton -Wang -Ward -Warde -Warden -Ware -Waring -Warner -Warren -Wash -Washington -Wat -Waverley -Waverly -Way -Waylan -Wayland -Waylen -Waylin -Waylon -Wayne -Web -Webb -Weber -Webster -Weidar -Weider -Welbie -Welby -Welch -Wells -Welsh -Wendall -Wendel -Wendell -Werner -Wes -Wesley -Weslie -West -Westbrook -Westbrooke -Westleigh -Westley -Weston -Weylin -Wheeler -Whit -Whitaker -Whitby -Whitman -Whitney -Whittaker -Wiatt -Wilber -Wilbert -Wilbur -Wilburn -Wilburt -Wilden -Wildon -Wilek -Wiley -Wilfred -Wilfrid -Wilhelm -Will -Willard -Willdon -Willem -Willey -Willi -William -Willie -Willis -Willmott -Willy -Wilmar -Wilmer -Wilson -Wilt -Wilton -Win -Windham -Winfield -Winford -Winfred -Winifield -Winn -Winnie -Winny -Winslow -Winston -Winthrop -Winton -Wit -Witold -Wittie -Witty -Wojciech -Wolf -Wolfgang -Wolfie -Wolfram -Wolfy -Woochang -Wood -Woodie -Woodman -Woodrow -Woody -Worden -Worth -Worthington -Worthy -Wright -Wyatan -Wyatt -Wye -Wylie -Wyn -Wyndham -Wynn -Wynton -Xavier -Xenos -Xerxes -Xever -Ximenes -Ximenez -Xymenes -Yaakov -Yacov -Yale -Yanaton -Yance -Yancey -Yancy -Yank -Yankee -Yard -Yardley -Yehudi -Yigal -Yule -Yuri -Yves -Zach -Zacharia -Zachariah -Zacharias -Zacharie -Zachary -Zacherie -Zachery -Zack -Zackariah -Zak -Zalman -Zane -Zared -Zary -Zeb -Zebadiah -Zebedee -Zebulen -Zebulon -Zechariah -Zed -Zedekiah -Zeke -Zelig -Zerk -Zeus -Zippy -Zollie -Zolly -Zorro diff --git a/exercises/02450Toolbox_Python/Data/messy_data/README.txt b/exercises/02450Toolbox_Python/Data/messy_data/README.txt deleted file mode 100644 index c5e842207f409b4e9e16496eaaf9ff590964932c..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/messy_data/README.txt +++ /dev/null @@ -1,70 +0,0 @@ -*************************************************************************** -*************************************************************************** -*** messy_data *** -*************************************************************************** -*************************************************************************** -This dataset is an adaption of an existing dataset to highlight some common -issues (or variants of them) that one might face across various datasets. 
-This is not real data, but is based on values from the Auto-Mpg Data. -The original data was obtained from: - https://archive.ics.uci.edu/ml/datasets/auto+mpg -but was modified to include some formatting issues as well as removing some -values. -Missing values in the original dataset were sometimes denoted -with a question mark. Some missing values were introduced, too. -Specifically zeroes in the attributes mpg and displacement can be -considered missing values. - -For reference, the description of the original dataset is provided below. - -*************************************************************************** -*************************************************************************** -*** Original dataset description *** -*************************************************************************** -*************************************************************************** -1. Title: Auto-Mpg Data - -2. Sources: - (a) Origin: This dataset was taken from the StatLib library which is - maintained at Carnegie Mellon University. The dataset was - used in the 1983 American Statistical Association Exposition. - (c) Date: July 7, 1993 - -3. Past Usage: - - See 2b (above) - - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. - In Proceedings on the Tenth International Conference of Machine - Learning, 236-243, University of Massachusetts, Amherst. Morgan - Kaufmann. - -4. Relevant Information: - - This dataset is a slightly modified version of the dataset provided in - the StatLib library. In line with the use by Ross Quinlan (1993) in - predicting the attribute "mpg", 8 of the original instances were removed - because they had unknown values for the "mpg" attribute. The original - dataset is available in the file "auto-mpg.data-original". - - "The data concerns city-cycle fuel consumption in miles per gallon, - to be predicted in terms of 3 multivalued discrete and 5 continuous - attributes." (Quinlan, 1993) - -5. Number of Instances: 398 - -6. Number of Attributes: 9 including the class attribute - -7. Attribute Information: - - 1. mpg: continuous - 2. cylinders: multi-valued discrete - 3. displacement: continuous - 4. horsepower: continuous - 5. weight: continuous - 6. acceleration: continuous - 7. model year: multi-valued discrete - 8. origin: multi-valued discrete - 9. car name: string (unique for each instance) - -8. Missing Attribute Values: horsepower has 6 missing values - - diff --git a/exercises/02450Toolbox_Python/Data/messy_data/messy_data.data b/exercises/02450Toolbox_Python/Data/messy_data/messy_data.data deleted file mode 100644 index f62d8f48d5f6682e03b68e209c73ce99b0b4bf13..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/messy_data/messy_data.data +++ /dev/null @@ -1,33 +0,0 @@ -messy_data -mpg cylinders displacement horsepower weight acceleration modelyear origin carname -mpg cyl disp hp w acc yr org name -18 8 ? 130 3'504 12.0 70 1 chevrolet chevelle malibu -15 8 350 165 3'693 11,5 70 1 buick skylark 320 -18 8 ? 150 3'436 11.0 70 1 plymouth satellite -16 8 ? 150 3'433 12.0 70 1 amc rebel sst -17 8 0 140 3'449 10,5 70 1 ford torino -15 8 429 198 4'341 10.0 70 1 ford galaxie 500 -14 8 454 220 4'354 9.0 70 1 chevrolet impala -14 8 ? 215 4312 8,5 70 1 plymouth fury iii -14 8 455 225 4425 10.0 70 1 pontiac catalina -15 8 390 190 3'850 8,5 70 1 amc ambassador dpl -15 8 0 170 3'563 10.0 70 1 dodge challenger se -14 8 ? 160 3'609 8.0 70 1 plymouth 'cuda 340 -99 8 ? 150 3'761 9,5 70 1 chevrolet monte carlo -14 8 ? 
225 3'086 10.0 70 1 buick estate wagon (sw) -24 4 113 95 2'372 15.0 70 3 toyota corona mark ii -22 6 95 2'833 15,5 70 1 plymouth duster -0 6 199 97 2'774 15,5 70 1 amc hornet -21 6 ? 85 2'587 16.0 70 1 ford maverick -27 4 97 88 2'130 14,5 70 3 datsun pl510 -26 4 46 1'835 20,5 70 2 volkswagen 1131 deluxe sedan -33 4 105 74 2190 14.2 81 2 volkswagen jetta -33.7 4 107 75 2210 14.4 81 3 honda prelude -32.4 4 108 75 2350 16.8 81 3 toyota corolla -32.9 4 119 100 2615 14.8 81 3 datsun 200sx -31.6 4 120 74 2635 18.3 81 3 mazda 626 -28.1 4 141 80 3'230 20.4 81 2 peugeot 505s turbo diesel -30.7 6 145 76 3'160 19.6 81 2 volvo diesel -0 6 168 116 2'900 12.6 81 3 toyota cressida -24.2 6 146 120 2'930 13.8 81 3 datsun 810 maxima - diff --git a/exercises/02450Toolbox_Python/Data/nanonose.xls b/exercises/02450Toolbox_Python/Data/nanonose.xls deleted file mode 100644 index e13e95be533adaa8792bd648f8b1e020f8684e9e..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/nanonose.xls and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/readme_male_female_data.txt b/exercises/02450Toolbox_Python/Data/readme_male_female_data.txt deleted file mode 100644 index 19518acc6c787f993164f4e4254069f90c1cd637..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/readme_male_female_data.txt +++ /dev/null @@ -1,12 +0,0 @@ -You may use the lists of names for any purpose, so long as credit is given -in any published work. You may also redistribute the list if you -provide the recipients with a copy of this README file. The lists are -not in the public domain (I retain the copyright on the lists) but are -freely redistributable. - -If you have any additions to the lists of names, I would appreciate -receiving them. - -My email address is mkant+@cs.cmu.edu. 
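The deleted messy_data files above spell out the formatting quirks one is meant to handle: '?' marks a missing value, zeroes in the mpg and displacement attributes are placeholders for missing values, weights use an apostrophe as a thousands separator (e.g. 3'504), and some acceleration values use a decimal comma (e.g. 11,5). Purely as a hedged illustration of how such values could be normalized — pandas is not required by the exercises, the frame below is made up, and only the column names are taken from the deleted header:

```python
import numpy as np
import pandas as pd

# A tiny made-up frame with the same quirks as the deleted messy_data.data file.
raw = pd.DataFrame({
    "mpg":          ["18", "0", "15"],           # a zero here means "missing"
    "displacement": ["?", "350", "0"],           # '?' marks a missing value
    "weight":       ["3'504", "3'693", "4312"],  # apostrophe thousands separator
    "acceleration": ["12.0", "11,5", "10.0"],    # decimal comma in some rows
})

clean = (
    raw.replace("?", np.nan)  # question marks become NaN
    .apply(lambda s: s.str.replace("'", "", regex=False)    # drop thousands separators
                      .str.replace(",", ".", regex=False))  # decimal comma -> point
    .astype(float)
)

# Zeroes in mpg and displacement are placeholders for missing values.
clean[["mpg", "displacement"]] = clean[["mpg", "displacement"]].replace(0, np.nan)
print(clean)
```

After this kind of normalization, the remaining NaNs can be handled with whatever missing-data strategy the exercise calls for (dropping rows, imputation, and so on).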
- -Mark Kantrowitz diff --git a/exercises/02450Toolbox_Python/Data/stopWords.txt b/exercises/02450Toolbox_Python/Data/stopWords.txt deleted file mode 100644 index 2d40fc0238119eaf8c9d2b86c3e7cb9f5e920c1e..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/stopWords.txt +++ /dev/null @@ -1,439 +0,0 @@ -a -about -above -accordingly -across -after -afterwards -again -against -all -allows -almost -alone -along -already -also -although -always -am -among -amongst -an -and -another -any -anybody -anyhow -anyone -anything -anywhere -apart -appear -appropriate -are -around -as -aside -associated -at -available -away -awfully -b -back -be -became -because -become -becomes -becoming -been -before -beforehand -behind -being -below -beside -besides -best -better -between -beyond -both -brief -but -by -c -came -can -cannot -cant -cause -causes -certain -changes -co -come -consequently -contain -containing -contains -corresponding -could -currently -d -day -described -did -different -do -does -doing -done -down -downwards -during -e -each -eg -eight -either -else -elsewhere -enough -et -etc -even -ever -every -everybody -everyone -everything -everywhere -ex -example -except -f -far -few -fifth -first -five -followed -following -for -former -formerly -forth -four -from -further -furthermore -g -get -gets -given -gives -go -gone -good -got -great -h -had -hardly -has -have -having -he -hence -her -here -hereafter -hereby -herein -hereupon -hers -herself -him -himself -his -hither -how -howbeit -however -i -ie -if -ignored -immediate -in -inasmuch -inc -indeed -indicate -indicated -indicates -inner -insofar -instead -into -inward -is -it -its -itself -j -just -k -keep -kept -know -l -last -latter -latterly -least -less -lest -life -like -little -long -ltd -m -made -make -man -many -may -me -meanwhile -men -might -more -moreover -most -mostly -mr -much -must -my -myself -n -name -namely -near -necessary -neither -never -nevertheless -new -next -nine -no -nobody -none -noone -nor -normally -not -nothing -novel -now -nowhere -o -of -off -often -oh -old -on -once -one -ones -only -onto -or -other -others -otherwise -ought -our -ours -ourselves -out -outside -over -overall -own -p -particular -particularly -people -per -perhaps -placed -please -plus -possible -probably -provides -q -que -quite -r -rather -really -relatively -respectively -right -s -said -same -second -secondly -see -seem -seemed -seeming -seems -self -selves -sensible -sent -serious -seven -several -shall -she -should -since -six -so -some -somebody -somehow -someone -something -sometime -sometimes -somewhat -somewhere -specified -specify -specifying -state -still -sub -such -sup -t -take -taken -than -that -the -their -theirs -them -themselves -then -thence -there -thereafter -thereby -therefore -therein -thereupon -these -they -third -this -thorough -thoroughly -those -though -three -through -throughout -thru -thus -time -to -together -too -toward -towards -twice -two -u -under -unless -until -unto -up -upon -us -use -used -useful -uses -using -usually -v -value -various -very -via -viz -vs -w -was -way -we -well -went -were -what -whatever -when -whence -whenever -where -whereafter -whereas -whereby -wherein -whereupon -wherever -whether -which -while -whither -who -whoever -whole -whom -whose -why -will -with -within -without -work -world -would -x -y -year -years -yet -you -your -yours -yourself -yourselves -z -zero \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Data/synth1.mat 
b/exercises/02450Toolbox_Python/Data/synth1.mat deleted file mode 100644 index 4eb623f831660cf7dbb0a655fc03ebb023b11442..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth1.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/synth2.mat b/exercises/02450Toolbox_Python/Data/synth2.mat deleted file mode 100644 index 99838d2eb7e3a60799dbb54dfc5a98db5b6fc8aa..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth2.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/synth3.mat b/exercises/02450Toolbox_Python/Data/synth3.mat deleted file mode 100644 index adefbcfc4075497a710c1e90fa65e4933fc02f9e..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth3.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/synth4.mat b/exercises/02450Toolbox_Python/Data/synth4.mat deleted file mode 100644 index 8a445f9c8fe5bb16f82b4312d2cb5e8051022c29..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth4.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/synth5.mat b/exercises/02450Toolbox_Python/Data/synth5.mat deleted file mode 100644 index e36f9391ba3707db3773a38ee5057fbaf3807424..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth5.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/synth6.mat b/exercises/02450Toolbox_Python/Data/synth6.mat deleted file mode 100644 index 3857ca1a9eff1a851e14746d63ea0b6522d961c6..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth6.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/synth7.mat b/exercises/02450Toolbox_Python/Data/synth7.mat deleted file mode 100644 index 1b421f60e9099fa77dcb3b9431f91a0587af016e..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/synth7.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/textDocs.txt b/exercises/02450Toolbox_Python/Data/textDocs.txt deleted file mode 100644 index 774880a8da78e7101ff3aaef5f76bd9487b34771..0000000000000000000000000000000000000000 --- a/exercises/02450Toolbox_Python/Data/textDocs.txt +++ /dev/null @@ -1,13 +0,0 @@ -The Google matrix P is a model of the internet - - -P_ij is nonzero if there is a link from webpage i to j - - -The Google matrix is used to rank all Web pages - - -The ranking is done by solving a matrix eigenvalue problem - - -England dropped out of the top 10 in the FIFA ranking \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Data/wildfaces.mat b/exercises/02450Toolbox_Python/Data/wildfaces.mat deleted file mode 100644 index 1f5894a43da3a0b7c7549d8937968a6488f8ca92..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/wildfaces.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/wildfaces_grayscale.mat b/exercises/02450Toolbox_Python/Data/wildfaces_grayscale.mat deleted file mode 100644 index bcab41e4da0ed2d6b988ed73eff4eb15b6408c2f..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/wildfaces_grayscale.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/wine.mat b/exercises/02450Toolbox_Python/Data/wine.mat deleted file mode 100644 index 3951017853d00623d8e761b42ae6dee9f222b05b..0000000000000000000000000000000000000000 
Binary files a/exercises/02450Toolbox_Python/Data/wine.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/wine2.mat b/exercises/02450Toolbox_Python/Data/wine2.mat deleted file mode 100644 index c6851a61390f2cdc3c694654aededa5761d78cd2..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/wine2.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/xor.mat b/exercises/02450Toolbox_Python/Data/xor.mat deleted file mode 100644 index ffef7df14ee096007810041e7efa42ea0f7f98da..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/xor.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/Data/zipdata.mat b/exercises/02450Toolbox_Python/Data/zipdata.mat deleted file mode 100644 index a98e79614d23893b45b1bb554f82db03b59a98ad..0000000000000000000000000000000000000000 Binary files a/exercises/02450Toolbox_Python/Data/zipdata.mat and /dev/null differ diff --git a/exercises/02450Toolbox_Python/README.md b/exercises/02450Toolbox_Python/README.md index 319d3b2fbfff3dd6a00d1cb3cffe158579645c5b..be5dc078152332aa1a3862283f44e6df5be4ea16 100644 --- a/exercises/02450Toolbox_Python/README.md +++ b/exercises/02450Toolbox_Python/README.md @@ -1,13 +1,15 @@ # 02450 Toolbox - Python ## Installation -The exercise scriprs foudn depend on a +The exercise scripts inside `/Scripts` depend on a course specific package [dtuimldmtools](https://pypi.org/project/dtuimldmtools/) which needs to be installed. -TODO: To be completed -TODO: Virtual envs +We recommend using a Python Virtual environment using [Anaconda](https://www.anaconda.com/download/) or [Miniconda](https://docs.conda.io/projects/miniconda/en/latest/miniconda-install.html) and installing the package inside it. To set up such an environment follow the guide provided by [DTU Python support](https://pythonsupport.dtu.dk/python/install-conda.html). -pip install 02450toolbox -import toolbox_02450 +Once setup, the package can be installed by running the following command: + +``` +pip install dtuimldmtools +``` ## Dataset diff --git a/exercises/02450Toolbox_Python/Scripts/ex0_4_3.py b/exercises/02450Toolbox_Python/Scripts/ex0_4_3.py index 65524d527ee0bff742ff847d7193b6079c586045..7e13f8905f5024b2229bea5b6a242bd2589d4fa0 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex0_4_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex0_4_3.py @@ -1,24 +1,24 @@ ## exercise 0.4.3 -# In Python you need to 'import' packages and external functions before you can -# use them. We can import NumPy (which enables us to work with matrices, among -# other things) by writing 'import numpy as np'. +# In Python you need to 'import' packages and external functions before you can +# use them. We can import NumPy (which enables us to work with matrices, among +# other things) by writing 'import numpy as np'. # We load the package into the ``namespace'' np to reference it easily, # now we can write 'np.sum(X)' instead of 'numpy.sum(X)'. -import numpy as np +import numpy as np # Remember you can mark a part of the code and press # F9 to run that part alone. 
# define variable a with numbers in the range from 0 to 7 (not inclusive) -a = np.arange(start=0,stop=7) +a = np.arange(start=0, stop=7) # define variable b with numbers in the range from 2 to 17 in steps of 4 -b = np.arange(start=2,stop=17,step=4) +b = np.arange(start=2, stop=17, step=4) # similar to b but without explicit decleration of the input arguments names -c = np.arange(100, 95, -1) +c = np.arange(100, 95, -1) -d = np.arange(1.2, 1.9, 0.1) +d = np.arange(1.2, 1.9, 0.1) -e = np.pi*np.arange(0,2.5,.5) +e = np.pi * np.arange(0, 2.5, 0.5) diff --git a/exercises/02450Toolbox_Python/Scripts/ex0_4_4.py b/exercises/02450Toolbox_Python/Scripts/ex0_4_4.py index 937e0e0ef0a53eba23ee4b9b274f09c1a5b315b9..c795bcc0272031a8e58015e1448d85c9c51c727d 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex0_4_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex0_4_4.py @@ -1,29 +1,30 @@ ## exercise 0.4.4 import numpy as np + # Extracting the elements from vectors is easy. Consider the # following definition of x and the echoed results x = np.concatenate([np.zeros(2), np.arange(0, 3.6, 0.6), np.ones(3)]) -x[1:5] # take out elements 2 through 5 (notice 6 is not included) -np.size(x) # return the size of x (equivalent to len(x) since x is an array) -len(x) # return the length of x +x[1:5] # take out elements 2 through 5 (notice 6 is not included) +np.size(x) # return the size of x (equivalent to len(x) since x is an array) +len(x) # return the length of x # Try writing help(len) and help(np.size) -x[-1] # take the last element of x -x[1::2] # return every other element of x starting from the 2nd +x[-1] # take the last element of x +x[1::2] # return every other element of x starting from the 2nd # The length of x is 11; what is x[11] - and why? # Inserting numbers into vectors is also easy. Using the same # definition of x and observe the results when typing -y = x; -y[1::2] = np.pi -# Notice that we're inserting the same scalar value "pi" into all elements +y = x +y[1::2] = np.pi +# Notice that we're inserting the same scalar value "pi" into all elements # that we index y with # You can also try: -#y[1::2] = np.arange(2,12,2) +# y[1::2] = np.arange(2,12,2) # Observe the results when indexing the vector y with -# y[1] and y[0]. Is y[0] defined? \ No newline at end of file +# y[1] and y[0]. Is y[0] defined? diff --git a/exercises/02450Toolbox_Python/Scripts/ex0_4_5.py b/exercises/02450Toolbox_Python/Scripts/ex0_4_5.py index 3b9ff06c71fb2e14620494fe7bbfae1a24812f62..0884bcf6b949d43eb5d2b3371eb10d117d975ed3 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex0_4_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex0_4_5.py @@ -2,97 +2,97 @@ import numpy as np # Setup two ararys -x = np.arange(1,6) -y = np.arange(2,12,2) +x = np.arange(1, 6) +y = np.arange(2, 12, 2) # Have a look at them by typing 'x' and 'y' in the console -# There's a difference between matrix multiplication and elementwise -# multiplication, and specifically in Python its also important if you +# There's a difference between matrix multiplication and elementwise +# multiplication, and specifically in Python its also important if you # are using the multiply operator "*" on an array object or a matrix object! 
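As a minimal, standalone illustration of the point made in the comment above (assuming nothing beyond NumPy itself), the sketch below contrasts the elementwise `*` operator with the `@` / `np.dot` matrix product on plain `ndarray` objects; the variable names mirror the exercise, but the snippet is not part of the script:

```python
import numpy as np

x = np.arange(1, 6)      # array([1, 2, 3, 4, 5])
y = np.arange(2, 12, 2)  # array([ 2,  4,  6,  8, 10])

print(x * y)         # elementwise product: [ 2  8 18 32 50]
print(x @ y)         # inner (matrix) product of two 1-D arrays: 110
print(np.dot(x, y))  # same result as x @ y: 110
```

With arrays, `*` is always elementwise while `@`/`np.dot` performs the matrix (here: inner) product, which is why `x * y` works on the arrays above but raises an error in the exercise once `x` and `y` are converted to 1-by-5 `np.matrix` objects, whose shapes are incompatible for matrix multiplication.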
# Use the * operator to multiply the two arrays: -x*y +x * y -# Now, convert the arrays into matrices - -x = np.asmatrix(np.arange(1,6)) -y = np.asmatrix(np.arange(2,12,2)) +# Now, convert the arrays into matrices - +x = np.asmatrix(np.arange(1, 6)) +y = np.asmatrix(np.arange(2, 12, 2)) # Again, have a look at them by typing 'x' and 'y' in the console # Try using the * operator just as before now: -x*y +x * y # You should now get an error - try to explain why. # array and matrix are two data structures added by NumPy package to the list of -# basic data structures in Python (lists, tuples, sets). We shall use both -# array and matrix structures extensively throughout this course, therefore -# make sure that you understand differences between them -# (multiplication, dimensionality) and that you are able to convert them one -# to another (asmatrix(), asarray() functions). -# Generally speaking, array objects are used to represent scientific, numerical, -# N-dimensional data. matrix objects can be very handy when it comes to +# basic data structures in Python (lists, tuples, sets). We shall use both +# array and matrix structures extensively throughout this course, therefore +# make sure that you understand differences between them +# (multiplication, dimensionality) and that you are able to convert them one +# to another (asmatrix(), asarray() functions). +# Generally speaking, array objects are used to represent scientific, numerical, +# N-dimensional data. matrix objects can be very handy when it comes to # algebraic operations on 2-dimensional matrices. # The ambiguity can be circumvented by using explicit function calls: -np.transpose(y) # transposition/transpose of y -y.transpose() # also transpose -y.T # also transpose +np.transpose(y) # transposition/transpose of y +y.transpose() # also transpose +y.T # also transpose -np.multiply(x,y) # element-wise multiplication +np.multiply(x, y) # element-wise multiplication -np.dot(x,y.T) # matrix multiplication -x @ y.T # also matrix multiplication +np.dot(x, y.T) # matrix multiplication +x @ y.T # also matrix multiplication # There are various ways to make certain type of matrices. -a1 = np.array([[1, 2, 3], [4, 5, 6]]) # define explicitly -a2 = np.arange(1,7).reshape(2,3) # reshape range of numbers -a3 = np.zeros([3,3]) # zeros array -a4 = np.eye(3) # diagonal array -a5 = np.random.rand(2,3) # random array -a6 = a1.copy() # copy -a7 = a1 # alias -m1 = np.matrix('1 2 3; 4 5 6; 7 8 9') # define matrix by string -m2 = np.asmatrix(a1.copy()) # copy array into matrix -m3 = np.mat(np.array([1, 2, 3])) # map array onto matrix -a8 = np.asarray(m1) # map matrix onto array - -# It is easy to extract and/or modify selected items from arrays/matrices. +a1 = np.array([[1, 2, 3], [4, 5, 6]]) # define explicitly +a2 = np.arange(1, 7).reshape(2, 3) # reshape range of numbers +a3 = np.zeros([3, 3]) # zeros array +a4 = np.eye(3) # diagonal array +a5 = np.random.rand(2, 3) # random array +a6 = a1.copy() # copy +a7 = a1 # alias +m1 = np.matrix("1 2 3; 4 5 6; 7 8 9") # define matrix by string +m2 = np.asmatrix(a1.copy()) # copy array into matrix +m3 = np.mat(np.array([1, 2, 3])) # map array onto matrix +a8 = np.asarray(m1) # map matrix onto array + +# It is easy to extract and/or modify selected items from arrays/matrices. 
# Here is how you can index matrix elements: -m = np.matrix('1 2 3; 4 5 6; 7 8 9') -m[0,0] # first element -m[-1,-1] # last element -m[0,:] # first row -m[:,1] # second column -m[1:3,-1] # view on selected rows&columns +m = np.matrix("1 2 3; 4 5 6; 7 8 9") +m[0, 0] # first element +m[-1, -1] # last element +m[0, :] # first row +m[:, 1] # second column +m[1:3, -1] # view on selected rows&columns # Similarly, you can selectively assign values to matrix elements or columns: -m[-1,-1] = 10000 -m[0:2,-1] = np.matrix('100; 1000') -m[:,0] = 0 +m[-1, -1] = 10000 +m[0:2, -1] = np.matrix("100; 1000") +m[:, 0] = 0 -# Logical indexing can be used to change or take only elements that +# Logical indexing can be used to change or take only elements that # fulfil a certain constraint, e.g. -m2[m2>0.5] # display values in m2 that are larger than 0.5 -m2[m2<0.5] = 0 # set all elements that are less than 0.5 to 0 +m2[m2 > 0.5] # display values in m2 that are larger than 0.5 +m2[m2 < 0.5] = 0 # set all elements that are less than 0.5 to 0 -#Below, several examples of common matrix operations, +# Below, several examples of common matrix operations, # most of which we will use in the following weeks. # First, define two matrices: -m1 = 10 * np.mat(np.ones([3,3])) -m2 = np.mat(np.random.rand(3,3)) - -m1+m2 # matrix summation -m1*m2 # matrix product -np.multiply(m1,m2) # element-wise multiplication -m1>m2 # element-wise comparison -m3 = np.hstack((m1,m2)) # combine/concatenate matrices horizontally -# note that this is not equivalent to e.g. +m1 = 10 * np.mat(np.ones([3, 3])) +m2 = np.mat(np.random.rand(3, 3)) + +m1 + m2 # matrix summation +m1 * m2 # matrix product +np.multiply(m1, m2) # element-wise multiplication +m1 > m2 # element-wise comparison +m3 = np.hstack((m1, m2)) # combine/concatenate matrices horizontally +# note that this is not equivalent to e.g. 
# l = [m1, m2] # in which case l is a list, and l[0] is m1 -m4 = np.vstack((m1,m2)) # combine/concatenate matrices vertically -m3.shape # shape of matrix -m3.mean() # mean value of all the elements -m3.mean(axis=0) # mean values of the columns -m3.mean(axis=1) # mean values of the rows -m3.transpose() # transpose, also: m3.T -m2.I # compute inverse matrix +m4 = np.vstack((m1, m2)) # combine/concatenate matrices vertically +m3.shape # shape of matrix +m3.mean() # mean value of all the elements +m3.mean(axis=0) # mean values of the columns +m3.mean(axis=1) # mean values of the rows +m3.transpose() # transpose, also: m3.T +m2.I # compute inverse matrix diff --git a/exercises/02450Toolbox_Python/Scripts/ex0_5_1.py b/exercises/02450Toolbox_Python/Scripts/ex0_5_1.py index 256c3be27a47f13cd8bf86195349faec2088fab6..6cb5ce885e80e6eca466650c3a6ee45db66da877 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex0_5_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex0_5_1.py @@ -1,13 +1,13 @@ ## exercise 0.5.1 -import numpy as np import matplotlib.pyplot as plt +import numpy as np x = np.arange(0, 1, 0.1) f = np.exp(x) plt.figure(1) plt.plot(x, f) -plt.xlabel('x') -plt.ylabel('f(x)=exp(x)') -plt.title('The exponential function') -plt.show() \ No newline at end of file +plt.xlabel("x") +plt.ylabel("f(x)=exp(x)") +plt.title("The exponential function") +plt.show() diff --git a/exercises/02450Toolbox_Python/Scripts/ex0_5_2.py b/exercises/02450Toolbox_Python/Scripts/ex0_5_2.py index 6da786e297ab9e858e530b37d21f759d0cd77e64..91a435acaa2a28c2edebaf833565bb54746ead5e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex0_5_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex0_5_2.py @@ -1,53 +1,52 @@ ## exercise 0.5.2 -import numpy as np import matplotlib.pyplot as plt +import numpy as np - -# We simulate measurements every 100 ms for a period of 10 seconds +# We simulate measurements every 100 ms for a period of 10 seconds t = np.arange(0, 10, 0.1) -# The data from the sensors are generated as either a sine or a cosine +# The data from the sensors are generated as either a sine or a cosine # with some Gaussian noise added. 
-sensor1 = 3*np.sin(t)+0.5*np.random.normal(size=len(t)) -sensor2 = 3*np.cos(t)+0.5*np.random.normal(size=len(t)) +sensor1 = 3 * np.sin(t) + 0.5 * np.random.normal(size=len(t)) +sensor2 = 3 * np.cos(t) + 0.5 * np.random.normal(size=len(t)) # Change the font size to make axis and title readable font_size = 15 -plt.rcParams.update({'font.size': font_size}) +plt.rcParams.update({"font.size": font_size}) # Define the name of the curves -legend_strings = ['Sensor 1', 'Sensor 2'] +legend_strings = ["Sensor 1", "Sensor 2"] # Start plotting the simulated measurements plt.figure(1) # Plot the sensor 1 output as a function of time, and # make the curve red and fully drawn -plt.plot(t, sensor1, 'r-') +plt.plot(t, sensor1, "r-") # Plot the sensor 2 output as a function of time, and # make the curve blue and dashed -plt.plot(t, sensor2, 'b--') +plt.plot(t, sensor2, "b--") -# Ensure that the limits on the axis fit the data -plt.axis('tight') +# Ensure that the limits on the axis fit the data +plt.axis("tight") # Add a grid in the background plt.grid() # Add a legend describing each curve, place it at the "best" location # so as to minimize the amount of curve it covers -plt.legend(legend_strings,loc='best') +plt.legend(legend_strings, loc="best") # Add labels to the axes -plt.xlabel('Time [s]') -plt.ylabel('Voltage [mV]') +plt.xlabel("Time [s]") +plt.ylabel("Voltage [mV]") # Add a title to the plot -plt.title('Sensor outputs') +plt.title("Sensor outputs") # Export the figure -plt.savefig('ex1_5_2.png') +plt.savefig("ex1_5_2.png") # Show the figure in the console plt.show() diff --git a/exercises/02450Toolbox_Python/Scripts/ex10_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex10_1_1.py index 346b6f5efa66cdbbd97418cdfeb07b8e47f21b9b..798eeb6834f374334b840904054bc4a594fdc3f5 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex10_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex10_1_1.py @@ -1,15 +1,19 @@ # exercise 10.1.1 +import importlib_resources from matplotlib.pyplot import figure, show from scipy.io import loadmat -from toolbox_02450 import clusterplot from sklearn.cluster import k_means +from dtuimldmtools import clusterplot + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth1.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) @@ -17,11 +21,11 @@ C = len(classNames) K = 4 # K-means clustering: -centroids, cls, inertia = k_means(X,K) - +centroids, cls, inertia = k_means(X, K) + # Plot results: -figure(figsize=(14,9)) +figure(figsize=(14, 9)) clusterplot(X, cls, centroids, y) show() -print('Ran Exercise 10.1.1') \ No newline at end of file +print("Ran Exercise 10.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py index e3ecd5d6e6364cb194af66b24e899b8ff72bd1ec..c66a9b557b92caede6af2b051105199be4fb3aa4 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex10_1_3.py @@ -1,17 +1,21 @@ - # exercise 10.1.3 -from matplotlib.pyplot import figure, title, plot, ylim, 
legend, show +# exercise 10.1.3 +import importlib_resources import numpy as np +from matplotlib.pyplot import figure, legend, plot, show, title, ylim from scipy.io import loadmat -from toolbox_02450 import clusterval from sklearn.cluster import k_means +from dtuimldmtools import clusterval + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth1.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) @@ -20,25 +24,25 @@ C = len(classNames) K = 10 # Allocate variables: -Rand = np.zeros((K-1,)) -Jaccard = np.zeros((K-1,)) -NMI = np.zeros((K-1,)) +Rand = np.zeros((K - 1,)) +Jaccard = np.zeros((K - 1,)) +NMI = np.zeros((K - 1,)) -for k in range(K-1): +for k in range(K - 1): # run K-means clustering: - #cls = Pycluster.kcluster(X,k+1)[0] - centroids, cls, inertia = k_means(X,k+2) + # cls = Pycluster.kcluster(X,k+1)[0] + centroids, cls, inertia = k_means(X, k + 2) # compute cluster validities: - Rand[k], Jaccard[k], NMI[k] = clusterval(y,cls) - + Rand[k], Jaccard[k], NMI[k] = clusterval(y, cls) + # Plot results: figure(1) -title('Cluster validity') -plot(np.arange(K-1)+2, Rand) -plot(np.arange(K-1)+2, Jaccard) -plot(np.arange(K-1)+2, NMI) -legend(['Rand', 'Jaccard', 'NMI'], loc=4) +title("Cluster validity") +plot(np.arange(K - 1) + 2, Rand) +plot(np.arange(K - 1) + 2, Jaccard) +plot(np.arange(K - 1) + 2, NMI) +legend(["Rand", "Jaccard", "NMI"], loc=4) show() -print('Ran Exercise 10.1.3') \ No newline at end of file +print("Ran Exercise 10.1.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py index 29e192a329d62b7b85cc5c821a962757a4a71b50..3b60ca2c570a0e45a8002ec69aed1f3f571026a8 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex10_1_5.py @@ -1,20 +1,23 @@ # exercise 10_1_5 -from matplotlib import pyplot as plt +import importlib_resources import numpy as np +from matplotlib import pyplot as plt from scipy.io import loadmat from sklearn.cluster import k_means +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wildfaces.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wildfaces.mat') -#mat_data = loadmat('../Data/digits.mat') #<-- uncomment this for using the digits dataset +mat_data = loadmat(filename) +# mat_data = loadmat('../Data/digits.mat') #<-- uncomment this for using the digits dataset -X = mat_data['X'] +X = mat_data["X"] N, M = X.shape # Image resolution and number of colors -x = 40 #<-- change this for using the digits dataset -y = 40 #<-- change this for using the digits dataset -c = 3 #<-- change this for using the digits dataset +x = 40 # <-- change this for using the digits dataset +y = 40 # <-- change this for using the digits dataset +c = 3 # <-- change this for using the digits dataset # Number of clusters: @@ -31,43 +34,48 @@ centroids, cls, inertia = k_means(X, K, verbose=True, max_iter=100, n_init=S) # Plot centroids plt.figure(1) -n1 = int(np.ceil(np.sqrt(K/2))) -n2 = 
int(np.ceil(float(K)/n1)) +n1 = int(np.ceil(np.sqrt(K / 2))) +n2 = int(np.ceil(float(K) / n1)) -#For black and white, cmap=plt.cm.binary, else default -cmap = plt.cm.binary if c==1 else None +# For black and white, cmap=plt.cm.binary, else default +cmap = plt.cm.binary if c == 1 else None for k in range(K): - plt.subplot(n1,n2,k+1) + plt.subplot(n1, n2, k + 1) # Reshape centroids to fit resolution and colors - img = np.reshape(centroids[k,:],(c,x,y)).T - if c == 1: # if color is single-color/gray scale + img = np.reshape(centroids[k, :], (c, x, y)).T + if c == 1: # if color is single-color/gray scale # Squeeze out singleton dimension # and flip the image (cancel out previos transpose) img = np.squeeze(img).T - plt.imshow(img,interpolation='None', cmap=cmap) - plt.xticks([]); plt.yticks([]) - if k==np.floor((n2-1)/2): plt.title('Centroids') + plt.imshow(img, interpolation="None", cmap=cmap) + plt.xticks([]) + plt.yticks([]) + if k == np.floor((n2 - 1) / 2): + plt.title("Centroids") -# Plot few randomly selected faces and their nearest centroids -L = 5 # number of images to plot +# Plot few randomly selected faces and their nearest centroids +L = 5 # number of images to plot j = np.random.randint(0, N, L) plt.figure(2) for l in range(L): - plt.subplot(2,L,l+1) - img = np.resize(X[j[l],:],(c,x,y)).T + plt.subplot(2, L, l + 1) + img = np.resize(X[j[l], :], (c, x, y)).T if c == 1: img = np.squeeze(img).T - plt.imshow(img,interpolation='None', cmap=cmap) - plt.xticks([]); plt.yticks([]) - if l==np.floor((L-1)/2): plt.title('Randomly selected faces and their centroids') - plt.subplot(2,L,L+l+1) - img = np.resize(centroids[cls[j[l]],:],(c,x,y)).T + plt.imshow(img, interpolation="None", cmap=cmap) + plt.xticks([]) + plt.yticks([]) + if l == np.floor((L - 1) / 2): + plt.title("Randomly selected faces and their centroids") + plt.subplot(2, L, L + l + 1) + img = np.resize(centroids[cls[j[l]], :], (c, x, y)).T if c == 1: img = np.squeeze(img).T - plt.imshow(img,interpolation='None', cmap=cmap) - plt.xticks([]); plt.yticks([]) + plt.imshow(img, interpolation="None", cmap=cmap) + plt.xticks([]) + plt.yticks([]) plt.show() -print('Ran Exercise 10.1.5') \ No newline at end of file +print("Ran Exercise 10.1.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex10_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex10_2_1.py index 4698d40dd98b3158fb26fd261081f65c404a67e3..d169e6a7f4aa454c47fc23cf2a5c000ab19aaf7c 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex10_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex10_2_1.py @@ -1,36 +1,42 @@ # exercise 10.2.1 +import importlib_resources from matplotlib.pyplot import figure, show +from scipy.cluster.hierarchy import dendrogram, fcluster, linkage from scipy.io import loadmat -from toolbox_02450 import clusterplot -from scipy.cluster.hierarchy import linkage, fcluster, dendrogram + +from dtuimldmtools import clusterplot + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat") # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth1.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) # Perform 
hierarchical/agglomerative clustering on data matrix -Method = 'single' -Metric = 'euclidean' +Method = "single" +Metric = "euclidean" Z = linkage(X, method=Method, metric=Metric) # Compute and display clusters by thresholding the dendrogram Maxclust = 4 -cls = fcluster(Z, criterion='maxclust', t=Maxclust) +cls = fcluster(Z, criterion="maxclust", t=Maxclust) figure(1) -clusterplot(X, cls.reshape(cls.shape[0],1), y=y) +clusterplot(X, cls.reshape(cls.shape[0], 1), y=y) # Display dendrogram -max_display_levels=6 -figure(2,figsize=(10,4)) -dendrogram(Z, truncate_mode='level', p=max_display_levels, color_threshold=Z[-Maxclust+1,2]) +max_display_levels = 6 +figure(2, figsize=(10, 4)) +dendrogram( + Z, truncate_mode="level", p=max_display_levels, color_threshold=Z[-Maxclust + 1, 2] +) show() -print('Ran Exercise 10.2.1') \ No newline at end of file +print("Ran Exercise 10.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex11_1_1.py index fe0633ba6466113677a07700ec1301f559f1e791..575da6722e7b6882f9958d6472dc6e27086594a1 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_1_1.py @@ -1,61 +1,72 @@ # exercise 11.1.1 -from matplotlib.pyplot import figure, show +import importlib_resources import numpy as np +from matplotlib.pyplot import figure, show from scipy.io import loadmat -from toolbox_02450 import clusterplot from sklearn.mixture import GaussianMixture + +from dtuimldmtools import clusterplot + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth2.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth2.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] -#X_old = X -#X = np.hstack([X,X]) +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] +# X_old = X +# X = np.hstack([X,X]) N, M = X.shape C = len(classNames) # Number of clusters K = 4 -cov_type = 'full' # e.g. 'full' or 'diag' +cov_type = "full" # e.g. 'full' or 'diag' # define the initialization procedure (initial value of means) -initialization_method = 'random'# 'random' or 'kmeans' +initialization_method = "random" # 'random' or 'kmeans' # random signifies random initiation, kmeans means we run a K-means and use the -# result as the starting point. K-means might converge faster/better than -# random, but might also cause the algorithm to be stuck in a poor local minimum +# result as the starting point. 
K-means might converge faster/better than +# random, but might also cause the algorithm to be stuck in a poor local minimum # type of covariance, you can try out 'diag' as well reps = 1 # number of fits with different initalizations, best result will be kept # Fit Gaussian mixture model -gmm = GaussianMixture(n_components=K, covariance_type=cov_type, n_init=reps, - tol=1e-6, reg_covar=1e-6, init_params=initialization_method).fit(X) -cls = gmm.predict(X) +gmm = GaussianMixture( + n_components=K, + covariance_type=cov_type, + n_init=reps, + tol=1e-6, + reg_covar=1e-6, + init_params=initialization_method, +).fit(X) +cls = gmm.predict(X) # extract cluster labels -cds = gmm.means_ +cds = gmm.means_ # extract cluster centroids (means of gaussians) covs = gmm.covariances_ # extract cluster shapes (covariances of gaussians) -if cov_type.lower() == 'diag': - new_covs = np.zeros([K,M,M]) - - count = 0 +if cov_type.lower() == "diag": + new_covs = np.zeros([K, M, M]) + + count = 0 for elem in covs: - temp_m = np.zeros([M,M]) + temp_m = np.zeros([M, M]) new_covs[count] = np.diag(elem) count += 1 covs = new_covs # Plot results: -figure(figsize=(14,9)) +figure(figsize=(14, 9)) clusterplot(X, clusterid=cls, centroids=cds, y=y, covars=covs) show() ## In case the number of features != 2, then a subset of features most be plotted instead. -#figure(figsize=(14,9)) -#idx = [0,1] # feature index, choose two features to use as x and y axis in the plot -#clusterplot(X[:,idx], clusterid=cls, centroids=cds[:,idx], y=y, covars=covs[:,idx,:][:,:,idx]) -#show() +# figure(figsize=(14,9)) +# idx = [0,1] # feature index, choose two features to use as x and y axis in the plot +# clusterplot(X[:,idx], clusterid=cls, centroids=cds[:,idx], y=y, covars=covs[:,idx,:][:,:,idx]) +# show() -print('Ran Exercise 11.1.1') \ No newline at end of file +print("Ran Exercise 11.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex11_1_5.py index 76ab3ef47eeff31545739ab8b230ed46fff0d233..3f2dfb92a138687dcd0253a825e67aacb8bbb75a 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_1_5.py @@ -1,27 +1,30 @@ # exercise 11.1.5 -from matplotlib.pyplot import figure, plot, legend, xlabel, show +import importlib_resources import numpy as np +from matplotlib.pyplot import figure, legend, plot, show, xlabel from scipy.io import loadmat -from sklearn.mixture import GaussianMixture from sklearn import model_selection +from sklearn.mixture import GaussianMixture + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth2.mat") # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth1.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) # Range of K's to try -KRange = range(1,11) +KRange = range(1, 11) T = len(KRange) -covar_type = 'full' # you can try out 'diag' as well -reps = 3 # number of fits with different initalizations, best result will be kept -init_procedure = 'kmeans' # 'kmeans' or 'random' +covar_type = "full" # you can try out 'diag' as well +reps = 3 # number of fits with different 
initalizations, best result will be kept +init_procedure = "kmeans" # 'kmeans' or 'random' # Allocate variables BIC = np.zeros((T,)) @@ -29,42 +32,48 @@ AIC = np.zeros((T,)) CVE = np.zeros((T,)) # K-fold crossvalidation -CV = model_selection.KFold(n_splits=10,shuffle=True) +CV = model_selection.KFold(n_splits=10, shuffle=True) + +for t, K in enumerate(KRange): + print("Fitting model for K={0}".format(K)) -for t,K in enumerate(KRange): - print('Fitting model for K={0}'.format(K)) + # Fit Gaussian mixture model + gmm = GaussianMixture( + n_components=K, + covariance_type=covar_type, + n_init=reps, + init_params=init_procedure, + tol=1e-6, + reg_covar=1e-6, + ).fit(X) - # Fit Gaussian mixture model - gmm = GaussianMixture(n_components=K, covariance_type=covar_type, - n_init=reps, init_params=init_procedure, - tol=1e-6, reg_covar=1e-6).fit(X) - - # Get BIC and AIC - BIC[t,] = gmm.bic(X) - AIC[t,] = gmm.aic(X) + # Get BIC and AIC + BIC[t,] = gmm.bic(X) + AIC[t,] = gmm.aic(X) - # For each crossvalidation fold - for train_index, test_index in CV.split(X): + # For each crossvalidation fold + for train_index, test_index in CV.split(X): + # extract training and test set for current CV fold + X_train = X[train_index] + X_test = X[test_index] - # extract training and test set for current CV fold - X_train = X[train_index] - X_test = X[test_index] + # Fit Gaussian mixture model to X_train + gmm = GaussianMixture( + n_components=K, covariance_type=covar_type, n_init=reps + ).fit(X_train) - # Fit Gaussian mixture model to X_train - gmm = GaussianMixture(n_components=K, covariance_type=covar_type, n_init=reps).fit(X_train) + # compute negative log likelihood of X_test + CVE[t] += -gmm.score_samples(X_test).sum() - # compute negative log likelihood of X_test - CVE[t] += -gmm.score_samples(X_test).sum() - # Plot results -figure(1); -plot(KRange, BIC,'-*b') -plot(KRange, AIC,'-xr') -plot(KRange, 2*CVE,'-ok') -legend(['BIC', 'AIC', 'Crossvalidation']) -xlabel('K') +figure(1) +plot(KRange, BIC, "-*b") +plot(KRange, AIC, "-xr") +plot(KRange, 2 * CVE, "-ok") +legend(["BIC", "AIC", "Crossvalidation"]) +xlabel("K") show() -print('Ran Exercise 11.1.5') \ No newline at end of file +print("Ran Exercise 11.1.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex11_2_1.py index bdbde574c96d0109fc76dfe21dbebd4c71557a86..f7b821cd784ccfdd11125e81c44352177fee2b0b 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_2_1.py @@ -12,21 +12,23 @@ M = 1 x = np.linspace(-10, 10, 50) # Allocate variable for data -X = np.empty((N,M)) +X = np.empty((N, M)) # Mean and covariances m = np.array([1, 3, 6]) -s = np.array([1, .5, 2]) +s = np.array([1, 0.5, 2]) # Draw samples from mixture of gaussians -c_sizes = np.random.multinomial(N, [1./3, 1./3, 1./3]) +c_sizes = np.random.multinomial(N, [1.0 / 3, 1.0 / 3, 1.0 / 3]) for c_id, c_size in enumerate(c_sizes): - X[c_sizes.cumsum()[c_id]-c_sizes[c_id]:c_sizes.cumsum()[c_id],:] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size,M)) + X[ + c_sizes.cumsum()[c_id] - c_sizes[c_id] : c_sizes.cumsum()[c_id], : + ] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size, M)) # Plot histogram of sampled data figure() -hist(X,x) +hist(X, x) show() -print('Ran Exercise 11.2.1') \ No newline at end of file +print("Ran Exercise 11.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex11_2_2.py index 
eaa71b38a5e9e1402d624599b6971ebc72e4bf75..c9007f93b3fab95c7bdfbb24cfd754e4eae5fee8 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_2_2.py @@ -1,16 +1,20 @@ # exercise 11.2.2 import numpy as np -from matplotlib.pyplot import figure, subplot, hist, title, show, plot +from matplotlib.pyplot import figure, hist, plot, show, subplot, title from scipy.stats.kde import gaussian_kde # Draw samples from mixture of gaussians (as in exercise 11.1.1) -N = 1000; M = 1 +N = 1000 +M = 1 x = np.linspace(-10, 10, 50) -X = np.empty((N,M)) -m = np.array([1, 3, 6]); s = np.array([1, .5, 2]) -c_sizes = np.random.multinomial(N, [1./3, 1./3, 1./3]) +X = np.empty((N, M)) +m = np.array([1, 3, 6]) +s = np.array([1, 0.5, 2]) +c_sizes = np.random.multinomial(N, [1.0 / 3, 1.0 / 3, 1.0 / 3]) for c_id, c_size in enumerate(c_sizes): - X[c_sizes.cumsum()[c_id]-c_sizes[c_id]:c_sizes.cumsum()[c_id],:] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size,M)) + X[ + c_sizes.cumsum()[c_id] - c_sizes[c_id] : c_sizes.cumsum()[c_id], : + ] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size, M)) # x-values to evaluate the KDE @@ -20,13 +24,13 @@ xe = np.linspace(-10, 10, 100) kde = gaussian_kde(X.ravel()) # Plot kernel density estimate -figure(figsize=(6,7)) -subplot(2,1,1) -hist(X,x) -title('Data histogram') -subplot(2,1,2) +figure(figsize=(6, 7)) +subplot(2, 1, 1) +hist(X, x) +title("Data histogram") +subplot(2, 1, 2) plot(xe, kde.evaluate(xe)) -title('Kernel density estimate') +title("Kernel density estimate") show() -print('Ran Exercise 11.2.2') \ No newline at end of file +print("Ran Exercise 11.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_2_3.py b/exercises/02450Toolbox_Python/Scripts/ex11_2_3.py index d0e24d6c113629df94410bca85b7aea4b37ab576..0119997e3a3f55e5a352d74f07ae7a7e1f605514 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_2_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_2_3.py @@ -1,16 +1,20 @@ # exercise 11.2.3 import numpy as np -from matplotlib.pyplot import figure, subplot, plot, hist, title, show +from matplotlib.pyplot import figure, hist, plot, show, subplot, title from sklearn.neighbors import NearestNeighbors # Draw samples from mixture of gaussians (as in exercise 11.1.1) -N = 1000; M = 1 +N = 1000 +M = 1 x = np.linspace(-10, 10, 50) -X = np.empty((N,M)) -m = np.array([1, 3, 6]); s = np.array([1, .5, 2]) -c_sizes = np.random.multinomial(N, [1./3, 1./3, 1./3]) +X = np.empty((N, M)) +m = np.array([1, 3, 6]) +s = np.array([1, 0.5, 2]) +c_sizes = np.random.multinomial(N, [1.0 / 3, 1.0 / 3, 1.0 / 3]) for c_id, c_size in enumerate(c_sizes): - X[c_sizes.cumsum()[c_id]-c_sizes[c_id]:c_sizes.cumsum()[c_id],:] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size,M)) + X[ + c_sizes.cumsum()[c_id] - c_sizes[c_id] : c_sizes.cumsum()[c_id], : + ] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size, M)) # Number of neighbors @@ -21,34 +25,36 @@ xe = np.linspace(-10, 10, 100) # Find the k nearest neighbors knn = NearestNeighbors(n_neighbors=K).fit(X) -D, i = knn.kneighbors(np.expand_dims(xe,axis=-1)) # note expand_dims is simple to make it (100,1) and not (100,) array +D, i = knn.kneighbors( + np.expand_dims(xe, axis=-1) +) # note expand_dims is simple to make it (100,1) and not (100,) array # Compute the density -knn_density = 1./(D[:,1:].sum(axis=1)/K) +knn_density = 1.0 / (D[:, 1:].sum(axis=1) / K) # Compute the average relative density DX, iX = knn.kneighbors(X) -knn_densityX = 1./(DX[:,1:].sum(axis=1)/K) 
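ex11_2_3 builds two related outlier scores: a KNN density, the reciprocal of the mean distance to the K nearest neighbors, and an average relative density, where each observation's density is divided by the mean density of its K neighbors. A minimal self-contained sketch of both quantities on a toy 1-D sample (the data, seed and K below are illustrative assumptions, not values taken from the exercise):

import numpy as np
from sklearn.neighbors import NearestNeighbors

# Toy 1-D sample drawn from two Gaussians (illustration only)
rng = np.random.default_rng(0)
X = np.concatenate([rng.normal(1, 1, 150), rng.normal(6, 2, 50)]).reshape(-1, 1)

K = 5
# Ask for K+1 neighbors so each point's zero distance to itself can be discarded
knn = NearestNeighbors(n_neighbors=K + 1).fit(X)
D, I = knn.kneighbors(X)

# KNN density: reciprocal of the mean distance to the K nearest neighbors
density = 1.0 / D[:, 1:].mean(axis=1)

# Average relative density: own density over the mean density of the K neighbors
ard = density / density[I[:, 1:]].mean(axis=1)

print("lowest densities:", np.sort(density)[:3])
print("lowest avg. rel. densities:", np.sort(ard)[:3])

Low values of either score flag candidate outliers; the relative version is less sensitive to clusters having different natural scales.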
-knn_avg_rel_density = knn_density/(knn_densityX[i[:,1:]].sum(axis=1)/K) +knn_densityX = 1.0 / (DX[:, 1:].sum(axis=1) / K) +knn_avg_rel_density = knn_density / (knn_densityX[i[:, 1:]].sum(axis=1) / K) # Plot KNN density -figure(figsize=(6,7)) -subplot(2,1,1) -hist(X,x) -title('Data histogram') -subplot(2,1,2) +figure(figsize=(6, 7)) +subplot(2, 1, 1) +hist(X, x) +title("Data histogram") +subplot(2, 1, 2) plot(xe, knn_density) -title('KNN density') +title("KNN density") # Plot KNN average relative density -figure(figsize=(6,7)) -subplot(2,1,1) -hist(X,x) -title('Data histogram') -subplot(2,1,2) +figure(figsize=(6, 7)) +subplot(2, 1, 1) +hist(X, x) +title("Data histogram") +subplot(2, 1, 2) plot(xe, knn_avg_rel_density) -title('KNN average relative density') +title("KNN average relative density") show() -print('Ran Exercise 11.2.3') \ No newline at end of file +print("Ran Exercise 11.2.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex11_3_1.py index 270db597c3db29557ca357b9ba1f97335f345970..92b652a956db13981ae4398fcec15e4cdc62c078 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_3_1.py @@ -1,18 +1,21 @@ # exercise 11.3.1 import numpy as np -from matplotlib.pyplot import figure, bar, title, show +from matplotlib.pyplot import bar, figure, show, title from scipy.stats.kde import gaussian_kde - # Draw samples from mixture of gaussians (as in exercise 11.1.1), add outlier -N = 1000; M = 1 +N = 1000 +M = 1 x = np.linspace(-10, 10, 50) -X = np.empty((N,M)) -m = np.array([1, 3, 6]); s = np.array([1, .5, 2]) -c_sizes = np.random.multinomial(N, [1./3, 1./3, 1./3]) +X = np.empty((N, M)) +m = np.array([1, 3, 6]) +s = np.array([1, 0.5, 2]) +c_sizes = np.random.multinomial(N, [1.0 / 3, 1.0 / 3, 1.0 / 3]) for c_id, c_size in enumerate(c_sizes): - X[c_sizes.cumsum()[c_id]-c_sizes[c_id]:c_sizes.cumsum()[c_id],:] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size,M)) -X[-1,0]=-10 # added outlier + X[ + c_sizes.cumsum()[c_id] - c_sizes[c_id] : c_sizes.cumsum()[c_id], : + ] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size, M)) +X[-1, 0] = -10 # added outlier # Compute kernel density estimate @@ -22,12 +25,12 @@ scores = kde.evaluate(X.ravel()) idx = scores.argsort() scores.sort() -print('The index of the lowest density object: {0}'.format(idx[0])) +print("The index of the lowest density object: {0}".format(idx[0])) # Plot kernel density estimate figure() -bar(range(20),scores[:20]) -title('Outlier score') +bar(range(20), scores[:20]) +title("Outlier score") show() -print('Ran Exercise 11.3.1') \ No newline at end of file +print("Ran Exercise 11.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex11_3_2.py index 140b8c832f34b2f70e1ae1dabbc659a311907c73..8b3d919fa92feff3cc6690fa9db987028e72181d 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_3_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_3_2.py @@ -1,31 +1,35 @@ # exercise 11.3.2 import numpy as np -from matplotlib.pyplot import figure, bar, title, plot, show -from toolbox_02450 import gausKernelDensity +from matplotlib.pyplot import bar, figure, plot, show, title + +from dtuimldmtools import gausKernelDensity # Draw samples from mixture of gaussians (as in exercise 11.1.1) -N = 1000; M = 1 +N = 1000 +M = 1 x = np.linspace(-10, 10, 50) -X = np.empty((N,M)) -m = np.array([1, 3, 6]); s = np.array([1, .5, 2]) -c_sizes = np.random.multinomial(N, [1./3, 1./3, 
1./3]) +X = np.empty((N, M)) +m = np.array([1, 3, 6]) +s = np.array([1, 0.5, 2]) +c_sizes = np.random.multinomial(N, [1.0 / 3, 1.0 / 3, 1.0 / 3]) for c_id, c_size in enumerate(c_sizes): - X[c_sizes.cumsum()[c_id]-c_sizes[c_id]:c_sizes.cumsum()[c_id],:] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size,M)) - + X[ + c_sizes.cumsum()[c_id] - c_sizes[c_id] : c_sizes.cumsum()[c_id], : + ] = np.random.normal(m[c_id], np.sqrt(s[c_id]), (c_size, M)) # Estimate the optimal kernel density width, by leave-one-out cross-validation -widths = 2.0**np.arange(-10,10) +widths = 2.0 ** np.arange(-10, 10) logP = np.zeros(np.size(widths)) -for i,w in enumerate(widths): +for i, w in enumerate(widths): f, log_f = gausKernelDensity(X, w) logP[i] = log_f.sum() val = logP.max() ind = logP.argmax() -width=widths[ind] -print('Optimal estimated width is: {0}'.format(width)) +width = widths[ind] +print("Optimal estimated width is: {0}".format(width)) # Estimate density for each observation not including the observation # itself in the density estimate @@ -36,15 +40,20 @@ i = (density.argsort(axis=0)).ravel() density = density[i] # Display the index of the lowest density data object -print('Lowest density: {0} for data object: {1}'.format(density[0],i[0])) +print("Lowest density: {0} for data object: {1}".format(density[0], i[0])) # Plot density estimate of outlier score figure(1) -bar(range(20),density[:20].reshape(-1,)) -title('Density estimate') +bar( + range(20), + density[:20].reshape( + -1, + ), +) +title("Density estimate") figure(2) plot(logP) -title('Optimal width') +title("Optimal width") show() -print('Ran Exercise 11.3.2') \ No newline at end of file +print("Ran Exercise 11.3.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex11_4_1.py b/exercises/02450Toolbox_Python/Scripts/ex11_4_1.py index c59b61fd8e79916fbce159ddba4a0be957c682e5..7edc22bc819f81c44826c9ffede44a23261b1cff 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex11_4_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex11_4_1.py @@ -1,99 +1,122 @@ # exercise 11.4.1 +import importlib_resources import numpy as np -from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm, - subplot, show) +from matplotlib.pyplot import ( + bar, + cm, + figure, + imshow, + show, + subplot, + title, + xticks, + yticks, +) from scipy.io import loadmat -from toolbox_02450 import gausKernelDensity from sklearn.neighbors import NearestNeighbors +from dtuimldmtools import gausKernelDensity + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat") + # load data from Matlab data file -matdata = loadmat('../Data/digits.mat') -X = np.array(matdata['X']) -y = np.array(matdata['y']) +matdata = loadmat(filename) +X = np.array(matdata["X"]) +y = np.array(matdata["y"]) N, M = np.shape(X) # Restrict the data to images of "2" -X = X[y.ravel()==2,:] +X = X[y.ravel() == 2, :] N, M = np.shape(X) ### Gausian Kernel density estimator # cross-validate kernel width by leave-one-out-cross-validation # (efficient implementation in gausKernelDensity function) # evaluate for range of kernel widths -widths = X.var(axis=0).max() * (2.0**np.arange(-10,3)) +widths = X.var(axis=0).max() * (2.0 ** np.arange(-10, 3)) logP = np.zeros(np.size(widths)) -for i,w in enumerate(widths): - print('Fold {:2d}, w={:f}'.format(i,w)) - density, log_density = gausKernelDensity(X,w) - logP[i] = log_density.sum() - +for i, w in enumerate(widths): + print("Fold {:2d}, w={:f}".format(i, w)) + density, log_density = gausKernelDensity(X, w) + logP[i] = 
log_density.sum() + val = logP.max() ind = logP.argmax() -width=widths[ind] -print('Optimal estimated width is: {0}'.format(width)) +width = widths[ind] +print("Optimal estimated width is: {0}".format(width)) # evaluate density for estimated width -density, log_density = gausKernelDensity(X,width) +density, log_density = gausKernelDensity(X, width) # Sort the densities i = (density.argsort(axis=0)).ravel() -density = density[i].reshape(-1,) +density = density[i].reshape( + -1, +) # Plot density estimate of outlier score figure(1) -bar(range(20),density[:20]) -title('Density estimate') +bar(range(20), density[:20]) +title("Density estimate") # Plot possible outliers figure(2) -for k in range(1,21): - subplot(4,5,k) - imshow(np.reshape(X[i[k],:], (16,16)).T, cmap=cm.binary) - xticks([]); yticks([]) - if k==3: title('Gaussian Kernel Density: Possible outliers') - +for k in range(1, 21): + subplot(4, 5, k) + imshow(np.reshape(X[i[k], :], (16, 16)).T, cmap=cm.binary) + xticks([]) + yticks([]) + if k == 3: + title("Gaussian Kernel Density: Possible outliers") ### K-neighbors density estimator # Neighbor to use: K = 5 -knn = NearestNeighbors(n_neighbors=K+1).fit(X) -def density(X,i): - ''' +knn = NearestNeighbors(n_neighbors=K + 1).fit(X) + + +def density(X, i): + """ Compute density at observation i in X using LOO. Note this code can easily be vectorized for speed - ''' - D, _ = knn.kneighbors(np.expand_dims(X[i],axis=0)) + """ + D, _ = knn.kneighbors(np.expand_dims(X[i], axis=0)) # don't compute distance to observation itself. - density = 1. / D[:, 1:].mean(axis=1) + density = 1.0 / D[:, 1:].mean(axis=1) return density -dens = np.concatenate([density(X,i) for i in range(N)]) + +dens = np.concatenate([density(X, i) for i in range(N)]) # Sort the scores i = dens.argsort() dens = dens[i] # Plot k-neighbor estimate of outlier score (distances) figure(3) -bar(range(20),dens[:20]) -title('KNN density: Outlier score') +bar(range(20), dens[:20]) +title("KNN density: Outlier score") # Plot possible outliers figure(4) -for k in range(1,21): - subplot(4,5,k) - imshow(np.reshape(X[i[k],:], (16,16)).T, cmap=cm.binary) - xticks([]); yticks([]) - if k==3: title('KNN density: Possible outliers') +for k in range(1, 21): + subplot(4, 5, k) + imshow(np.reshape(X[i[k], :], (16, 16)).T, cmap=cm.binary) + xticks([]) + yticks([]) + if k == 3: + title("KNN density: Possible outliers") + ### K-nearest neigbor average relative density # Compute the average relative density -def ard(X,i): - _, J = knn.kneighbors(np.expand_dims(X[i],axis=0)) - J = J[0,1:] # don't include i itself. - return density(X,i) / np.mean( [density(X, j) for j in J] ) +def ard(X, i): + _, J = knn.kneighbors(np.expand_dims(X[i], axis=0)) + J = J[0, 1:] # don't include i itself. 
+ return density(X, i) / np.mean([density(X, j) for j in J]) + -avg_rel_density = np.concatenate( [ard(X,i) for i in range(N) ] ) +avg_rel_density = np.concatenate([ard(X, i) for i in range(N)]) # Sort the avg.rel.densities i_avg_rel = avg_rel_density.argsort() @@ -101,49 +124,55 @@ avg_rel_density = avg_rel_density[i_avg_rel] # Plot k-neighbor estimate of outlier score (distances) figure(5) -bar(range(20),avg_rel_density[:20]) -title('KNN average relative density: Outlier score') +bar(range(20), avg_rel_density[:20]) +title("KNN average relative density: Outlier score") # Plot possible outliers figure(6) -for k in range(1,21): - subplot(4,5,k) - imshow(np.reshape(X[i_avg_rel[k],:], (16,16)).T, cmap=cm.binary) - xticks([]); yticks([]) - if k==3: title('KNN average relative density: Possible outliers') +for k in range(1, 21): + subplot(4, 5, k) + imshow(np.reshape(X[i_avg_rel[k], :], (16, 16)).T, cmap=cm.binary) + xticks([]) + yticks([]) + if k == 3: + title("KNN average relative density: Possible outliers") ### Distance to 5'th nearest neighbor outlier score K = 5 # Find the k nearest neighbors -knn = NearestNeighbors(n_neighbors=K+1).fit(X) +knn = NearestNeighbors(n_neighbors=K + 1).fit(X) D, i = knn.kneighbors(X) # Outlier score -score = D[:,K-1] +score = D[:, K - 1] # Sort the scores i = score.argsort() score = score[i[::-1]] # Plot k-neighbor estimate of outlier score (distances) figure(7) -bar(range(20),score[:20]) -title('5th neighbor distance: Outlier score') +bar(range(20), score[:20]) +title("5th neighbor distance: Outlier score") # Plot possible outliers figure(8) -for k in range(1,21): - subplot(4,5,k) - imshow(np.reshape(X[i[k],:], (16,16)).T, cmap=cm.binary); - xticks([]); yticks([]) - if k==3: title('5th neighbor distance: Possible outliers') +for k in range(1, 21): + subplot(4, 5, k) + imshow(np.reshape(X[i[k], :], (16, 16)).T, cmap=cm.binary) + xticks([]) + yticks([]) + if k == 3: + title("5th neighbor distance: Possible outliers") # Plot random digits (the first 20 in the data set), for comparison figure(9) -for k in range(1,21): - subplot(4,5,k); - imshow(np.reshape(X[k,:], (16,16)).T, cmap=cm.binary); - xticks([]); yticks([]) - if k==3: title('Random digits from data set') +for k in range(1, 21): + subplot(4, 5, k) + imshow(np.reshape(X[k, :], (16, 16)).T, cmap=cm.binary) + xticks([]) + yticks([]) + if k == 3: + title("Random digits from data set") show() -print('Ran Exercise 11.4.1') \ No newline at end of file +print("Ran Exercise 11.4.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex12_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex12_1_3.py index 4c2a2b5eecc5ff52dd27ecc74f0ec0e484343f2d..6bc830706101b3427c3ff9dc9b0b7211711ab442 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex12_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex12_1_3.py @@ -1,21 +1,25 @@ # ex12_1_3 +import importlib_resources import numpy as np + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/courses.txt") + # Load data. 
There is probably a library-way to parse the file but we will take the scenic route -with open('../Data/courses.txt','r') as f: +with open(filename, "r") as f: D = f.read() print("Raw data matrix is:") print(D) -D = [ [int(x) for x in ds.split(",")] for ds in D.split('\n') if len(ds) > 0] +D = [[int(x) for x in ds.split(",")] for ds in D.split("\n") if len(ds) > 0] N = len(D) -M = max( [max(v) for v in D]) -X = np.zeros( (N,M)) +M = max([max(v) for v in D]) +X = np.zeros((N, M)) for i in range(N): - d_m1 = [j-1 for j in D[i]] - X[i,d_m1] = 1 + d_m1 = [j - 1 for j in D[i]] + X[i, d_m1] = 1 # We should now have the correct binary data matrix: labels = ["02322", "02450", "02451", "02453", "02454", "02457", "02459", "02582"] print("Transformed data matrix X is:") print(labels) print(X) -print("All-done!") \ No newline at end of file +print("All-done!") diff --git a/exercises/02450Toolbox_Python/Scripts/ex12_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex12_1_4.py index 757509330b42e4f8bb51f5fd1a24040f5e90e7b2..2ab9035cdc670eb2b115c616c9e95e08effd3e71 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex12_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex12_1_4.py @@ -1,8 +1,10 @@ # install apyori using a standard method: conda install apyori or pip install apyori -from apyori import apriori # Load resources from previous exercise import numpy as np -from ex12_1_3 import X,labels +from apyori import apriori +from ex12_1_3 import X, labels + + # ex12_1_4 # This is a helper function that transforms a binary matrix into transactions. # Note the format used for courses.txt was (nearly) in a transaction format, @@ -17,9 +19,11 @@ def mat2transactions(X, labels=[]): T.append(l) return T + # apyori requires data to be in a transactions format, forunately we just wrote a helper function to do that. -T = mat2transactions(X,labels) -rules = apriori( T, min_support=0.8, min_confidence=1) +T = mat2transactions(X, labels) +rules = apriori(T, min_support=0.8, min_confidence=1) + # This function print the found rules and also returns a list of rules in the format: # [(x,y), ...] @@ -27,13 +31,15 @@ rules = apriori( T, min_support=0.8, min_confidence=1) def print_apriori_rules(rules): frules = [] for r in rules: - for o in r.ordered_statistics: + for o in r.ordered_statistics: conf = o.confidence supp = r.support - x = ", ".join( list( o.items_base ) ) - y = ", ".join( list( o.items_add ) ) - print("{%s} -> {%s} (supp: %.3f, conf: %.3f)"%(x,y, supp, conf)) - frules.append( (x,y) ) + x = ", ".join(list(o.items_base)) + y = ", ".join(list(o.items_add)) + print("{%s} -> {%s} (supp: %.3f, conf: %.3f)" % (x, y, supp, conf)) + frules.append((x, y)) return frules + + # Print rules found in the courses file. 
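Since apyori's return format is easy to get wrong, here is a tiny stand-alone sketch of the transaction input it expects and of how support and confidence come back on each rule; the transactions and thresholds below are invented for illustration:

from apyori import apriori  # pip install apyori

# Toy transactions: each inner list is one "basket" of items
T = [
    ["02450", "02457"],
    ["02450", "02457", "02582"],
    ["02450", "02582"],
    ["02450", "02457"],
]

# Every record carries its support plus one ordered statistic per left/right split
for r in apriori(T, min_support=0.5, min_confidence=0.7):
    for o in r.ordered_statistics:
        lhs = ", ".join(o.items_base)
        rhs = ", ".join(o.items_add)
        print("{%s} -> {%s} (supp: %.2f, conf: %.2f)" % (lhs, rhs, r.support, o.confidence))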
-print_apriori_rules(rules) \ No newline at end of file +print_apriori_rules(rules) diff --git a/exercises/02450Toolbox_Python/Scripts/ex12_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex12_1_5.py index 3b485fdb59ede347fd0c2a3ef70af19048c14b2c..a7e7ad6459f4154a844db0dfffb7a2dcc981c295 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex12_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex12_1_5.py @@ -1,14 +1,21 @@ # ex12_1_5 # Load data from the wine dataset +import importlib_resources from scipy.io import loadmat -mat_data = loadmat('../Data/wine.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0][0] for name in mat_data['attributeNames']] + +from dtuimldmtools import binarize2 + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat") + +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0][0] for name in mat_data["attributeNames"]] # We will now transform the wine dataset into a binary format. Notice the changed attribute names: -from toolbox_02450.similarity import binarize2 + + Xbin, attributeNamesBin = binarize2(X, attributeNames) print("X, i.e. the wine dataset, has now been transformed into:") print(Xbin) -print(attributeNamesBin) \ No newline at end of file +print(attributeNamesBin) diff --git a/exercises/02450Toolbox_Python/Scripts/ex12_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex12_1_6.py index ce9d3ce12747d21c8455f08b8a330dbd85c46f17..91a9200360ccd481a545b9235a21fc43e1e2c842 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex12_1_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex12_1_6.py @@ -1,10 +1,10 @@ # ex12_1_6 # Load resources from previous exercise +from apyori import apriori from ex12_1_4 import mat2transactions, print_apriori_rules from ex12_1_5 import Xbin, attributeNamesBin -from apyori import apriori # Given the processed data in the previous exercise this becomes easy: -T = mat2transactions(Xbin,labels=attributeNamesBin) -rules = apriori(T, min_support=0.3, min_confidence=.6) -print_apriori_rules(rules) \ No newline at end of file +T = mat2transactions(Xbin, labels=attributeNamesBin) +rules = apriori(T, min_support=0.3, min_confidence=0.6) +print_apriori_rules(rules) diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_5_1.py b/exercises/02450Toolbox_Python/Scripts/ex1_5_1.py index 334a7c01e2daa85e194d3a3d251cc07ccbb5ade8..d5c7463e0194f997d6ae4e0a1b4825bf6f128b92 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex1_5_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex1_5_1.py @@ -1,75 +1,75 @@ # exercise 1.5.1 +import importlib_resources import numpy as np import pandas as pd # Load the Iris csv data using the Pandas library -filename = '../Data/iris.csv' +filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.csv") df = pd.read_csv(filename) # Pandas returns a dataframe, (df) which could be used for handling the data. -# We will however convert the dataframe to numpy arrays for this course as +# We will however convert the dataframe to numpy arrays for this course as # is also described in the table in the exercise -raw_data = df.values +raw_data = df.values # Notice that raw_data both contains the information we want to store in an array -# X (the sepal and petal dimensions) and the information that we wish to store +# X (the sepal and petal dimensions) and the information that we wish to store # in y (the class labels, that is the iris species). # We start by making the data matrix X by indexing into data. 
-# We know that the attributes are stored in the four columns from inspecting +# We know that the attributes are stored in the four columns from inspecting # the file. -cols = range(0, 4) +cols = range(0, 4) X = raw_data[:, cols] # We can extract the attribute names that came from the header of the csv attributeNames = np.asarray(df.columns[cols]) # Before we can store the class index, we need to convert the strings that -# specify the class of a given object to a numerical value. We start by +# specify the class of a given object to a numerical value. We start by # extracting the strings for each sample from the raw data loaded from the csv: -classLabels = raw_data[:,-1] # -1 takes the last column -# Then determine which classes are in the data by finding the set of -# unique class labels +classLabels = raw_data[:, -1] # -1 takes the last column +# Then determine which classes are in the data by finding the set of +# unique class labels classNames = np.unique(classLabels) # We can assign each type of Iris class with a number by making a # Python dictionary as so: -classDict = dict(zip(classNames,range(len(classNames)))) +classDict = dict(zip(classNames, range(len(classNames)))) # The function zip simply "zips" togetter the classNames with an integer, -# like a zipper on a jacket. +# like a zipper on a jacket. # For instance, you could zip a list ['A', 'B', 'C'] with ['D', 'E', 'F'] to -# get the pairs ('A','D'), ('B', 'E'), and ('C', 'F'). -# A Python dictionary is a data object that stores pairs of a key with a value. -# This means that when you call a dictionary with a given key, you +# get the pairs ('A','D'), ('B', 'E'), and ('C', 'F'). +# A Python dictionary is a data object that stores pairs of a key with a value. +# This means that when you call a dictionary with a given key, you # get the stored corresponding value. Try highlighting classDict and press F9. -# You'll see that the first (key, value)-pair is ('Iris-setosa', 0). -# If you look up in the dictionary classDict with the value 'Iris-setosa', +# You'll see that the first (key, value)-pair is ('Iris-setosa', 0). +# If you look up in the dictionary classDict with the value 'Iris-setosa', # you will get the value 0. Try it with classDict['Iris-setosa'] # With the dictionary, we can look up each data objects class label (the string) -# in the dictionary, and determine which numerical value that object is +# in the dictionary, and determine which numerical value that object is # assigned. This is the class index vector y: y = np.array([classDict[cl] for cl in classLabels]) # In the above, we have used the concept of "list comprehension", which # is a compact way of performing some operations on a list or array. -# You could read the line "For each class label (cl) in the array of +# You could read the line "For each class label (cl) in the array of # class labels (classLabels), use the class label (cl) as the key and look up # in the class dictionary (classDict). Store the result for each class label -# as an element in a list (because of the brackets []). Finally, convert the -# list to a numpy array". -# Try running this to get a feel for the operation: +# as an element in a list (because of the brackets []). Finally, convert the +# list to a numpy array". 
+# Try running this to get a feel for the operation: # list = [0,1,2] # new_list = [element+10 for element in list] -# We can determine the number of data objects and number of attributes using +# We can determine the number of data objects and number of attributes using # the shape of X N, M = X.shape -# Finally, the last variable that we need to have the dataset in the +# Finally, the last variable that we need to have the dataset in the # "standard representation" for the course, is the number of classes, C: C = len(classNames) - diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_5_2.py b/exercises/02450Toolbox_Python/Scripts/ex1_5_2.py index 102ed00943dd83b92f579d154907f95055a764ef..7c28d1b6acd79a543adeefcf2e8f0e19d5c3fde8 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex1_5_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex1_5_2.py @@ -1,38 +1,43 @@ # exercise 1.5.2 +import importlib_resources import numpy as np # You can read data from excel spreadsheets after installing and importing xlrd # module. In most cases, you will need only few functions to accomplish it: # open_workbook(), col_values(), row_values() import xlrd -# If you need more advanced reference, or if you are interested how to write + +# If you need more advanced reference, or if you are interested how to write # data to excel files, see the following tutorial: # http://www.simplistix.co.uk/presentations/python-excel.pdf} # Load xls sheet with data # There's only a single sheet in the .xls, so we take out that sheet -doc = xlrd.open_workbook('../Data/iris.xls').sheet_by_index(0) + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls") + +doc = xlrd.open_workbook(filename).sheet_by_index(0) # Extract attribute names attributeNames = doc.row_values(rowx=0, start_colx=0, end_colx=4) # Try calling help(doc.row_values). You'll see that the above means -# that we extract columns 0 through 4 from the first row of the document, +# that we extract columns 0 through 4 from the first row of the document, # which contains the header of the xls files (where the attributen names are) -# Extract class names to python list, then encode with integers (dict) just as -# we did previously. The class labels are in the 5th column, in the rows 2 to +# Extract class names to python list, then encode with integers (dict) just as +# we did previously. The class labels are in the 5th column, in the rows 2 to # and up to 151: -classLabels = doc.col_values(4,1,151) # check out help(doc.col_values) +classLabels = doc.col_values(4, 1, 151) # check out help(doc.col_values) classNames = sorted(set(classLabels)) -classDict = dict(zip(classNames,range(len(classNames)))) +classDict = dict(zip(classNames, range(len(classNames)))) # Extract vector y, convert to NumPy array y = np.array([classDict[value] for value in classLabels]) # Preallocate memory, then extract data to matrix X -X = np.empty((150,4)) +X = np.empty((150, 4)) for i in range(4): - X[:,i] = np.array(doc.col_values(i,1,151)).T + X[:, i] = np.array(doc.col_values(i, 1, 151)).T # Compute values of N, M and C. 
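Both iris loaders above turn the string class labels into the integer vector y via a dictionary built with zip; the same encoding step in isolation, on made-up labels:

import numpy as np

# Made-up labels standing in for the species column of the iris data
classLabels = np.array(["setosa", "virginica", "setosa", "versicolor"])

classNames = np.unique(classLabels)                        # sorted unique labels
classDict = dict(zip(classNames, range(len(classNames))))  # e.g. setosa -> 0, versicolor -> 1, virginica -> 2
y = np.array([classDict[cl] for cl in classLabels])

print(y)  # [0 2 0 1]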
N = len(y) diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_5_3.py b/exercises/02450Toolbox_Python/Scripts/ex1_5_3.py index f91c899871c876436c10bbc2dc6c557cf3746df2..97b969279de7da78251f587b795c9368310852a3 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex1_5_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex1_5_3.py @@ -1,34 +1,36 @@ # exercise 1.5.3 -import numpy as np - +import importlib_resources from scipy.io import loadmat -# You can load the matlab data (matlab's m-file) to Python environment with -# 'loadmat()' function imported from 'scipy.io' module. -# The matlab workspace is loaded as a dictionary, with keys corresponding to + +# You can load the matlab data (matlab's m-file) to Python environment with +# 'loadmat()' function imported from 'scipy.io' module. +# The matlab workspace is loaded as a dictionary, with keys corresponding to # matlab variable names, and values to arrays representing matlab matrices. # Load Matlab data file to python dict structure -iris_mat = loadmat('../Data/iris.mat', squeeze_me=True) -# The argument squeeze_me ensures that there the variables we get from the + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.mat") +iris_mat = loadmat(filename, squeeze_me=True) +# The argument squeeze_me ensures that there the variables we get from the # MATLAB filed are not stored within "unneeded" array dimensions. # You can check which variables are in the loaded dict by calling # the function keys() for the dict: -#mat_data.keys() +# mat_data.keys() # this will tell you that X, y, M, N and C are stored in the dictionary, # as well as some extra information about e.g. the used MATLAB version. # We'll extract the needed variables by using these keys: -X = iris_mat['X'] -y = iris_mat['y'] -M = iris_mat['M'] -N = iris_mat['N'] -C = iris_mat['C'] -attributeNames = iris_mat['attributeNames'] -classNames = iris_mat['classNames'] +X = iris_mat["X"] +y = iris_mat["y"] +M = iris_mat["M"] +N = iris_mat["N"] +C = iris_mat["C"] +attributeNames = iris_mat["attributeNames"] +classNames = iris_mat["classNames"] # Loading the Iris data from the .mat-file was quite easy, because all the work -# of putting it into the correct format was already done. This is of course -# likely not the case for your own data, where you'll need to do something -# similar to the two previous exercises. We will, however, sometimes in the -# course use .mat-files in the exercises. \ No newline at end of file +# of putting it into the correct format was already done. This is of course +# likely not the case for your own data, where you'll need to do something +# similar to the two previous exercises. We will, however, sometimes in the +# course use .mat-files in the exercises. diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_5_4.py b/exercises/02450Toolbox_Python/Scripts/ex1_5_4.py index 19bc9ad0ee2b9e08c20594d28140d84d70f8374c..f853fb3ff99838acaa4c36cb94d7d700cf852958 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex1_5_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex1_5_4.py @@ -1,10 +1,9 @@ ## exercise 1.5.4 # Start by running the exercise 1.5.3 to load the Iris data in # "classification format": -from ex1_5_3 import * - -import numpy as np import matplotlib.pyplot as plt +import numpy as np +from ex1_5_3 import * ## Classification problem # The current variables X and y represent a classification problem, in @@ -13,19 +12,18 @@ import matplotlib.pyplot as plt # the variable y). 
A relevant figure for this classification problem could # for instance be one that shows how the classes are distributed based on # two attributes in matrix X: -X_c = X.copy(); -y_c = y.copy(); -attributeNames_c = attributeNames.copy(); -i = 1; j = 2; -color = ['r','g', 'b'] -plt.title('Iris classification problem') +X_c = X.copy() +y_c = y.copy() +attributeNames_c = attributeNames.copy() +i = 1 +j = 2 +color = ["r", "g", "b"] +plt.title("Iris classification problem") for c in range(len(classNames)): idx = y_c == c - plt.scatter(x=X_c[idx, i], - y=X_c[idx, j], - c=color[c], - s=50, alpha=0.5, - label=classNames[c]) + plt.scatter( + x=X_c[idx, i], y=X_c[idx, j], c=color[c], s=50, alpha=0.5, label=classNames[c] + ) plt.legend() plt.xlabel(attributeNames_c[i]) plt.ylabel(attributeNames_c[j]) @@ -39,21 +37,21 @@ plt.show() # petal length cannot any longer be in the data matrix X. # The first thing we do is store all the information we have in the # other format in one data matrix: -data = np.concatenate((X_c, np.expand_dims(y_c,axis=1)), axis=1) +data = np.concatenate((X_c, np.expand_dims(y_c, axis=1)), axis=1) # We need to do expand_dims to y_c for the dimensions of X_c and y_c to fit. # We know that the petal length corresponds to the third column in the data # matrix (see attributeNames), and therefore our new y variable is: y_r = data[:, 2] -# Similarly, our new X matrix is all the other information but without the +# Similarly, our new X matrix is all the other information but without the # petal length (since it's now the y variable): X_r = data[:, [0, 1, 3, 4]] # Since the iris class information (which is now the last column in X_r) is a # categorical variable, we will do a one-out-of-K encoding of the variable: species = np.array(X_r[:, -1], dtype=int).T -K = species.max()+1 +K = species.max() + 1 species_encoding = np.zeros((species.size, K)) species_encoding[np.arange(species.size), species] = 1 # The encoded information is now a 150x3 matrix. This corresponds to 150 @@ -63,33 +61,31 @@ species_encoding[np.arange(species.size), species] = 1 # We need to replace the last column in X (which was the not encoded # version of the species data) with the encoded version: -X_r = np.concatenate( (X_r[:, :-1], species_encoding), axis=1) +X_r = np.concatenate((X_r[:, :-1], species_encoding), axis=1) # Now, X is of size 150x6 corresponding to the three measurements of the # Iris that are not the petal length as well as the three variables that # specifies whether or not a given observations is or isn't a certain type. 
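The regression reformatting above relies on a one-out-of-K (one-hot) encoding built from np.zeros and integer fancy indexing, which places a single 1 per row in the column of that row's class. The same pattern in isolation, on a made-up species vector:

import numpy as np

# Made-up integer class indices (0, 1 or 2) standing in for the species column
species = np.array([0, 2, 1, 1, 0])

K = species.max() + 1                            # number of classes
one_hot = np.zeros((species.size, K))
one_hot[np.arange(species.size), species] = 1    # row i gets a 1 in column species[i]

print(one_hot)
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [0. 1. 0.]
#  [1. 0. 0.]]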
-# We need to update the attribute names and store the petal length name +# We need to update the attribute names and store the petal length name # as the name of the target variable for a regression: targetName_r = attributeNames_c[2] -attributeNames_r = np.concatenate((attributeNames_c[[0, 1, 3]], classNames), - axis=0) +attributeNames_r = np.concatenate((attributeNames_c[[0, 1, 3]], classNames), axis=0) # Lastly, we update M, since we now have more attributes: -N,M = X_r.shape +N, M = X_r.shape # A relevant figure for this regression problem could # for instance be one that shows how the target, that is the petal length, # changes with one of the predictors in X: -i = 2 -plt.title('Iris regression problem') -plt.plot(X_r[:, i], y_r, 'o') -plt.xlabel(attributeNames_r[i]); -plt.ylabel(targetName_r); +i = 2 +plt.title("Iris regression problem") +plt.plot(X_r[:, i], y_r, "o") +plt.xlabel(attributeNames_r[i]) +plt.ylabel(targetName_r) # Consider if you see a relationship between the predictor variable on the # x-axis (the variable from X) and the target variable on the y-axis (the # variable y). Could you draw a straight line through the data points for -# any of the attributes (choose different i)? -# Note that, when i is 3, 4, or 5, the x-axis is based on a binary -# variable, in which case a scatter plot is not as such the best option for -# visulizing the information. - +# any of the attributes (choose different i)? +# Note that, when i is 3, 4, or 5, the x-axis is based on a binary +# variable, in which case a scatter plot is not as such the best option for +# visulizing the information. diff --git a/exercises/02450Toolbox_Python/Scripts/ex1_5_5.py b/exercises/02450Toolbox_Python/Scripts/ex1_5_5.py index 0e90359b6591b44a85fbfc9287fe08c9ed0b64b5..052b3abbfab29fc56d968be65e921fdfb604855c 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex1_5_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex1_5_5.py @@ -1,21 +1,21 @@ ## exercise 1.5.5 -import numpy as np import matplotlib.pyplot as plt +import numpy as np # In this exercise we will rely on pandas for some of the processing steps: import pandas as pd # We start by defining the path to the file that we're we need to load. # Upon inspection, we saw that the messy_data.data was infact a file in the -# format of a CSV-file with a ".data" extention instead. -file_path = '../Data/messy_data/messy_data.data' +# format of a CSV-file with a ".data" extention instead. +file_path = "../data/messy_data/messy_data.data" # First of we simply read the file in using readtable, however, we need to # tell the function that the file is tab-seperated. We also need to specify # that the header is in the second row: -messy_data = pd.read_csv(file_path, sep='\t', header=1) +messy_data = pd.read_csv(file_path, sep="\t", header=1) # We also need to remove the added header line in the .data file which seems # to have included a shortend form the variables (check messy_data.head()): -messy_data = messy_data.drop(messy_data.index[0]) +messy_data = messy_data.drop(messy_data.index[0]) # We extract the attribute names: attributeNames = np.asarray(messy_data.columns) @@ -27,31 +27,31 @@ attributeNames = np.asarray(messy_data.columns) # car make and model. 
We decide to extract this in a variable for itself # for now, and then remove it from messy_data: car_names = np.array(messy_data.carname) -messy_data = messy_data.drop(['carname'], axis=1) +messy_data = messy_data.drop(["carname"], axis=1) # Inspect messy data by e.g.: -#print(messy_data.to_string()) +# print(messy_data.to_string()) # At this point, youll see that some of the missing values from the data -# has already been represented as NaNs (in the displacement column). +# has already been represented as NaNs (in the displacement column). # However, these were only the places where an empty element was in the file. # First off, we remove the question marks in displacement and replace # them with not a number, NaN: -messy_data.displacement = messy_data.displacement.str.replace('?','NaN') +messy_data.displacement = messy_data.displacement.str.replace("?", "NaN") # Similarly, we remove the formatting for a thousand seperator that is # present for the weight attribute: -messy_data.weight = messy_data.weight.str.replace("'", '') +messy_data.weight = messy_data.weight.str.replace("'", "") # And lastly, we replace all the commas that were used as decimal seperatos # in the accceleration attribute with dots: -messy_data.acceleration = messy_data.acceleration.str.replace(",", '.') +messy_data.acceleration = messy_data.acceleration.str.replace(",", ".") # the data has some zero values that the README.txt tolds us were missing # values - this was specifically for the attributes mpg and displacement, # so we're careful only to replace the zeros in these attributes, since a # zero might be correct for some other variables: -messy_data.mpg = messy_data.mpg.replace({'0': np.nan}) -messy_data.displacement = messy_data.displacement.replace({'0': np.nan}) +messy_data.mpg = messy_data.mpg.replace({"0": np.nan}) +messy_data.displacement = messy_data.displacement.replace({"0": np.nan}) # We later on find out that a value of 99 for the mpg is not value that is # within reason for the MPG of the cars in this dataset. The observations @@ -75,11 +75,11 @@ y_r = data[:, 0].copy() # Since origin is categorical variable, we can do as in previos exercises # and do a one-out-of-K encoding: -origin = np.array(X_r[:, -1], dtype=int).T-1 -K = origin.max()+1 +origin = np.array(X_r[:, -1], dtype=int).T - 1 +K = origin.max() + 1 origin_encoding = np.zeros((origin.size, K)) origin_encoding[np.arange(origin.size), origin] = 1 -X_r = np.concatenate((X_r[:, :-1], origin_encoding),axis=1) +X_r = np.concatenate((X_r[:, :-1], origin_encoding), axis=1) # Since the README.txt doesn't supply a lot of information about what the # levels in the origin variable mean, you'd have to either make an educated # guess based on the values in the context, or preferably obtain the @@ -98,7 +98,7 @@ X_r = np.concatenate((X_r[:, :-1], origin_encoding),axis=1) # to keep it mind to never do any of them blindly. Keep a record of what # you do, and consider/discuss how it might affect your modelling. -# The simplest way of handling missing values is to drop any records +# The simplest way of handling missing values is to drop any records # that display them, we do this by first determining where there are # missing values: missing_idx = np.isnan(data) @@ -110,36 +110,37 @@ data_drop_missing_obs = data[np.logical_not(obs_w_missing), :] # Another approach is to first investigate where the missing values are. 
# A quick way to do this is to visually look at the missing_idx: -plt.title('Visual inspection of missing values') +plt.title("Visual inspection of missing values") plt.imshow(missing_idx) -plt.ylabel('Observations'); plt.xlabel('Attributes'); +plt.ylabel("Observations") +plt.xlabel("Attributes") plt.show() # From such a plot, we can see that the issue is the third column, the # displacement attribute. This can be confirmed by e.g. doing: -#np.sum(missing_idx, 0) -# which shows that 12 observations are missing a value in the third column. -# Therefore, another way to move forward is to disregard displacement +# np.sum(missing_idx, 0) +# which shows that 12 observations are missing a value in the third column. +# Therefore, another way to move forward is to disregard displacement # (for now) and remove the attribute. We then remove the few # remaining observations with missing values: cols = np.ones((data.shape[1]), dtype=bool) cols[2] = 0 -data_wo_displacement = data[:, cols] -obs_w_missing_wo_displacement = np.sum(np.isnan(data_wo_displacement),1)>0 +data_wo_displacement = data[:, cols] +obs_w_missing_wo_displacement = np.sum(np.isnan(data_wo_displacement), 1) > 0 data_drop_disp_then_missing = data[np.logical_not(obs_w_missing_wo_displacement), :] # Now we have kept all but two of the observations. This however, doesn't # necesarrily mean that this approach is superior to the previous one, # since we have now also lost any and all information that we could have -# gotten from the displacement attribute. +# gotten from the displacement attribute. # One could impute the missing values - "guess them", in some # sense - while trying to minimize the impact of the guess. # A simply way of imputing them is to replace the missing values # with the median of the attribute. 
We would have to do this for the # missing values for attributes 1 and 3: -data_imputed = data.copy(); +data_imputed = data.copy() for att in [0, 2]: - # We use nanmedian to ignore the nan values + # We use nanmedian to ignore the nan values impute_val = np.nanmedian(data[:, att]) idx = missing_idx[:, att] - data_imputed[idx, att] = impute_val; + data_imputed[idx, att] = impute_val diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py index 7d21db17ab4cdba5bd8b92ae9c40ec9e31398d0f..b210d13bfaca05d29b799ca0f1cc103802dee09e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_1.py @@ -1,9 +1,11 @@ # exercise 2.1.1 +import importlib_resources import numpy as np import xlrd # Load xls sheet with data -doc = xlrd.open_workbook('../Data/nanonose.xls').sheet_by_index(0) +filename = importlib_resources.files("dtuimldmtools").joinpath("data/nanonose.xls") +doc = xlrd.open_workbook(filename).sheet_by_index(0) # Extract attribute names (1st row, column 4 to 12) attributeNames = doc.row_values(0, 3, 11) @@ -27,4 +29,4 @@ N = len(y) M = len(attributeNames) C = len(classNames) -print('Ran Exercise 2.1.1') \ No newline at end of file +print("Ran Exercise 2.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py index 45429a5883e3efbab8410e44f5b412fb90d2ad8d..98847f60502ada91cb0b8289234d68daeb6959ba 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_2.py @@ -2,9 +2,10 @@ # Imports the numpy and xlrd package, then runs the ex2_1_1 code from ex2_1_1 import * +from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel + # (requires data structures from ex. 2.1.1) -from matplotlib.pyplot import figure, plot, title, legend, xlabel, ylabel, show # Data attributes to be plotted i = 0 @@ -14,18 +15,18 @@ j = 1 # Make a simple plot of the i'th attribute against the j'th attribute # Notice that X is of matrix type (but it will also work with a numpy array) # X = np.array(X) #Try to uncomment this line -plot(X[:, i], X[:, j], 'o') +plot(X[:, i], X[:, j], "o") # %% -# Make another more fancy plot that includes legend, class labels, +# Make another more fancy plot that includes legend, class labels, # attribute names, and a title. f = figure() -title('NanoNose data') +title("NanoNose data") for c in range(C): # select indices belonging to class c: - class_mask = y==c - plot(X[class_mask,i], X[class_mask,j], 'o',alpha=.3) + class_mask = y == c + plot(X[class_mask, i], X[class_mask, j], "o", alpha=0.3) legend(classNames) xlabel(attributeNames[i]) @@ -33,4 +34,4 @@ ylabel(attributeNames[j]) # Output result to screen show() -print('Ran Exercise 2.1.2') \ No newline at end of file +print("Ran Exercise 2.1.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py index 6853efb9c883bed33ede41c877b80f4d96d15605..c977a4c69b0e1a4cc311470122cbdffe59a72ae6 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_3.py @@ -1,31 +1,30 @@ # exercise 2.1.3 # (requires data structures from ex. 
2.2.1) -from ex2_1_1 import * - import matplotlib.pyplot as plt +from ex2_1_1 import * from scipy.linalg import svd # Subtract mean value from data -Y = X - np.ones((N,1))*X.mean(axis=0) +Y = X - np.ones((N, 1)) * X.mean(axis=0) # PCA by computing SVD of Y -U,S,V = svd(Y,full_matrices=False) +U, S, V = svd(Y, full_matrices=False) # Compute variance explained by principal components -rho = (S*S) / (S*S).sum() +rho = (S * S) / (S * S).sum() threshold = 0.9 # Plot variance explained plt.figure() -plt.plot(range(1,len(rho)+1),rho,'x-') -plt.plot(range(1,len(rho)+1),np.cumsum(rho),'o-') -plt.plot([1,len(rho)],[threshold, threshold],'k--') -plt.title('Variance explained by principal components'); -plt.xlabel('Principal component'); -plt.ylabel('Variance explained'); -plt.legend(['Individual','Cumulative','Threshold']) +plt.plot(range(1, len(rho) + 1), rho, "x-") +plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-") +plt.plot([1, len(rho)], [threshold, threshold], "k--") +plt.title("Variance explained by principal components") +plt.xlabel("Principal component") +plt.ylabel("Variance explained") +plt.legend(["Individual", "Cumulative", "Threshold"]) plt.grid() plt.show() -print('Ran Exercise 2.1.3') \ No newline at end of file +print("Ran Exercise 2.1.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py index beacf3f82150ec2ed6cad78aa6c159fb3b3e41dd..46788689ab957d3409370102720978e17d89a453 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_4.py @@ -1,18 +1,17 @@ # exercise 2.1.4 # (requires data structures from ex. 2.2.1 and 2.2.3) from ex2_1_1 import * - -from matplotlib.pyplot import figure, plot, title, xlabel, ylabel, show, legend +from matplotlib.pyplot import figure, legend, plot, show, title, xlabel, ylabel from scipy.linalg import svd # Subtract mean value from data -Y = X - np.ones((N,1))*X.mean(0) +Y = X - np.ones((N, 1)) * X.mean(0) # PCA by computing SVD of Y -U,S,Vh = svd(Y,full_matrices=False) +U, S, Vh = svd(Y, full_matrices=False) # scipy.linalg.svd returns "Vh", which is the Hermitian (transpose) # of the vector V. So, for us to obtain the correct V, we transpose: -V = Vh.T +V = Vh.T # Project the centered data onto principal component space Z = Y @ V @@ -23,17 +22,17 @@ j = 1 # Plot PCA of the data f = figure() -title('NanoNose data: PCA') -#Z = array(Z) +title("NanoNose data: PCA") +# Z = array(Z) for c in range(C): # select indices belonging to class c: - class_mask = y==c - plot(Z[class_mask,i], Z[class_mask,j], 'o', alpha=.5) + class_mask = y == c + plot(Z[class_mask, i], Z[class_mask, j], "o", alpha=0.5) legend(classNames) -xlabel('PC{0}'.format(i+1)) -ylabel('PC{0}'.format(j+1)) +xlabel("PC{0}".format(i + 1)) +ylabel("PC{0}".format(j + 1)) # Output result to screen show() -print('Ran Exercise 2.1.4') \ No newline at end of file +print("Ran Exercise 2.1.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py index 322abc611d8e99cc1da99c8d88dbe46a70917ade..364a20146ce59e5f312d1d59025b73dbc1a91d6f 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_5.py @@ -1,47 +1,46 @@ # exercise 2.2.4 # (requires data structures from ex. 
2.2.1) -from ex2_1_1 import * - import matplotlib.pyplot as plt +from ex2_1_1 import * from scipy.linalg import svd -Y = X - np.ones((N,1))*X.mean(0) -U,S,Vh = svd(Y,full_matrices=False) -V=Vh.T -N,M = X.shape +Y = X - np.ones((N, 1)) * X.mean(0) +U, S, Vh = svd(Y, full_matrices=False) +V = Vh.T +N, M = X.shape # We saw in 2.1.3 that the first 3 components explaiend more than 90 # percent of the variance. Let's look at their coefficients: -pcs = [0,1,2] -legendStrs = ['PC'+str(e+1) for e in pcs] -c = ['r','g','b'] -bw = .2 -r = np.arange(1,M+1) -for i in pcs: - plt.bar(r+i*bw, V[:,i], width=bw) -plt.xticks(r+bw, attributeNames) -plt.xlabel('Attributes') -plt.ylabel('Component coefficients') +pcs = [0, 1, 2] +legendStrs = ["PC" + str(e + 1) for e in pcs] +c = ["r", "g", "b"] +bw = 0.2 +r = np.arange(1, M + 1) +for i in pcs: + plt.bar(r + i * bw, V[:, i], width=bw) +plt.xticks(r + bw, attributeNames) +plt.xlabel("Attributes") +plt.ylabel("Component coefficients") plt.legend(legendStrs) plt.grid() -plt.title('NanoNose: PCA Component Coefficients') +plt.title("NanoNose: PCA Component Coefficients") plt.show() # Inspecting the plot, we see that the 2nd principal component has large # (in magnitude) coefficients for attributes A, E and H. We can confirm # this by looking at it's numerical values directly, too: -print('PC2:') -print(V[:,1].T) +print("PC2:") +print(V[:, 1].T) # How does this translate to the actual data and its projections? # Looking at the data for water: # Projection of water class onto the 2nd principal component. -all_water_data = Y[y==4,:] +all_water_data = Y[y == 4, :] -print('First water observation') -print(all_water_data[0,:]) +print("First water observation") +print(all_water_data[0, :]) # Based on the coefficients and the attribute values for the observation # displayed, would you expect the projection onto PC2 to be positive or @@ -49,6 +48,6 @@ print(all_water_data[0,:]) # coefficient and the attribute! # You can determine the projection by (remove comments): -print('...and its projection onto PC2') -print(all_water_data[0,:]@V[:,1]) -# Try to explain why? \ No newline at end of file +print("...and its projection onto PC2") +print(all_water_data[0, :] @ V[:, 1]) +# Try to explain why? diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py index 02259d96766dadbb2c207ace5869da056d56b485..78250ef25ad0a9015e9c4ce7e8ab0268f48d89f2 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_1_6.py @@ -1,101 +1,102 @@ ## exercise 2.1.6 -from ex2_1_1 import * import matplotlib.pyplot as plt +from ex2_1_1 import * from scipy.linalg import svd -r = np.arange(1,X.shape[1]+1) -plt.bar(r, np.std(X,0)) +r = np.arange(1, X.shape[1] + 1) +plt.bar(r, np.std(X, 0)) plt.xticks(r, attributeNames) -plt.ylabel('Standard deviation') -plt.xlabel('Attributes') -plt.title('NanoNose: attribute standard deviations') +plt.ylabel("Standard deviation") +plt.xlabel("Attributes") +plt.title("NanoNose: attribute standard deviations") ## Investigate how standardization affects PCA # Try this *later* (for last), and explain the effect -#X_s = X.copy() # Make a to be "scaled" version of X -#X_s[:, 2] = 100*X_s[:, 2] # Scale/multiply attribute C with a factor 100 +# X_s = X.copy() # Make a to be "scaled" version of X +# X_s[:, 2] = 100*X_s[:, 2] # Scale/multiply attribute C with a factor 100 # Use X_s instead of X to in the script below to see the difference. 
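A minimal, self-contained sketch of the scaling experiment that ex2_1_6 describes (the data and names below are illustrative only, not part of the exercise scripts). It shows that scaling one attribute by a factor of 100 lets it dominate the variance explained when the data is only centered, but has no effect once each attribute is also divided by its standard deviation.

import numpy as np
from scipy.linalg import svd

rng = np.random.default_rng(0)
X_toy = rng.normal(size=(100, 4))        # toy data: 100 observations, 4 attributes
X_scaled = X_toy.copy()
X_scaled[:, 2] = 100 * X_scaled[:, 2]    # scale the third attribute by a factor 100

def variance_explained(A, standardize=False):
    Y = A - A.mean(axis=0)               # center the data
    if standardize:
        Y = Y / Y.std(axis=0)            # broadcast: divide each column by its std
    S = svd(Y, full_matrices=False)[1]   # singular values
    return (S * S) / (S * S).sum()

print(variance_explained(X_scaled))                    # dominated by the scaled attribute
print(variance_explained(X_scaled, standardize=True))  # scaling no longer matters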
# Does it affect the two columns in the plot equally? # Subtract the mean from the data -Y1 = X - np.ones((N, 1))*X.mean(0) +Y1 = X - np.ones((N, 1)) * X.mean(0) # Subtract the mean from the data and divide by the attribute standard # deviation to obtain a standardized dataset: -Y2 = X - np.ones((N, 1))*X.mean(0) -Y2 = Y2*(1/np.std(Y2,0)) -# Here were utilizing the broadcasting of a row vector to fit the dimensions +Y2 = X - np.ones((N, 1)) * X.mean(0) +Y2 = Y2 * (1 / np.std(Y2, 0)) +# Here were utilizing the broadcasting of a row vector to fit the dimensions # of Y2 # Store the two in a cell, so we can just loop over them: Ys = [Y1, Y2] -titles = ['Zero-mean', 'Zero-mean and unit variance'] +titles = ["Zero-mean", "Zero-mean and unit variance"] threshold = 0.9 # Choose two PCs to plot (the projection) i = 0 j = 1 # Make the plot -plt.figure(figsize=(10,15)) -plt.subplots_adjust(hspace=.4) -plt.title('NanoNose: Effect of standardization') -nrows=3 -ncols=2 +plt.figure(figsize=(10, 15)) +plt.subplots_adjust(hspace=0.4) +plt.title("NanoNose: Effect of standardization") +nrows = 3 +ncols = 2 for k in range(2): # Obtain the PCA solution by calculate the SVD of either Y1 or Y2 - U,S,Vh = svd(Ys[k],full_matrices=False) - V=Vh.T # For the direction of V to fit the convention in the course we transpose + U, S, Vh = svd(Ys[k], full_matrices=False) + V = Vh.T # For the direction of V to fit the convention in the course we transpose # For visualization purposes, we flip the directionality of the # principal directions such that the directions match for Y1 and Y2. - if k==1: V = -V; U = -U; - + if k == 1: + V = -V + U = -U + # Compute variance explained - rho = (S*S) / (S*S).sum() - + rho = (S * S) / (S * S).sum() + # Compute the projection onto the principal components - Z = U*S; - + Z = U * S + # Plot projection - plt.subplot(nrows, ncols, 1+k) + plt.subplot(nrows, ncols, 1 + k) C = len(classNames) for c in range(C): - plt.plot(Z[y==c,i], Z[y==c,j], '.', alpha=.5) - plt.xlabel('PC'+str(i+1)) - plt.xlabel('PC'+str(j+1)) - plt.title(titles[k] + '\n' + 'Projection' ) + plt.plot(Z[y == c, i], Z[y == c, j], ".", alpha=0.5) + plt.xlabel("PC" + str(i + 1)) + plt.xlabel("PC" + str(j + 1)) + plt.title(titles[k] + "\n" + "Projection") plt.legend(classNames) - plt.axis('equal') - + plt.axis("equal") + # Plot attribute coefficients in principal component space - plt.subplot(nrows, ncols, 3+k) + plt.subplot(nrows, ncols, 3 + k) for att in range(V.shape[1]): - plt.arrow(0,0, V[att,i], V[att,j]) - plt.text(V[att,i], V[att,j], attributeNames[att]) - plt.xlim([-1,1]) - plt.ylim([-1,1]) - plt.xlabel('PC'+str(i+1)) - plt.ylabel('PC'+str(j+1)) + plt.arrow(0, 0, V[att, i], V[att, j]) + plt.text(V[att, i], V[att, j], attributeNames[att]) + plt.xlim([-1, 1]) + plt.ylim([-1, 1]) + plt.xlabel("PC" + str(i + 1)) + plt.ylabel("PC" + str(j + 1)) plt.grid() # Add a unit circle - plt.plot(np.cos(np.arange(0, 2*np.pi, 0.01)), - np.sin(np.arange(0, 2*np.pi, 0.01))); - plt.title(titles[k] +'\n'+'Attribute coefficients') - plt.axis('equal') - + plt.plot( + np.cos(np.arange(0, 2 * np.pi, 0.01)), np.sin(np.arange(0, 2 * np.pi, 0.01)) + ) + plt.title(titles[k] + "\n" + "Attribute coefficients") + plt.axis("equal") + # Plot cumulative variance explained - plt.subplot(nrows, ncols, 5+k); - plt.plot(range(1,len(rho)+1),rho,'x-') - plt.plot(range(1,len(rho)+1),np.cumsum(rho),'o-') - plt.plot([1,len(rho)],[threshold, threshold],'k--') - plt.title('Variance explained by principal components'); - plt.xlabel('Principal component'); - 
plt.ylabel('Variance explained'); - plt.legend(['Individual','Cumulative','Threshold']) + plt.subplot(nrows, ncols, 5 + k) + plt.plot(range(1, len(rho) + 1), rho, "x-") + plt.plot(range(1, len(rho) + 1), np.cumsum(rho), "o-") + plt.plot([1, len(rho)], [threshold, threshold], "k--") + plt.title("Variance explained by principal components") + plt.xlabel("Principal component") + plt.ylabel("Variance explained") + plt.legend(["Individual", "Cumulative", "Threshold"]) plt.grid() - plt.title(titles[k]+'\n'+'Variance explained') + plt.title(titles[k] + "\n" + "Variance explained") plt.show() - - \ No newline at end of file diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py index c7ff4f82c23371a4312b855f33406e36d4d2ab1f..798bb5b46dba276b98ff5c3f0da86e13f66bfe15 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_1.py @@ -1,35 +1,36 @@ -from matplotlib.pyplot import (figure, subplot, imshow, xlabel, title, -yticks, show,cm) -from scipy.io import loadmat +import importlib_resources import numpy as np +from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xlabel, yticks +from scipy.io import loadmat +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") # Index of the digit to display i = 0 # Load Matlab data file to python dict structure -mat_data = loadmat('../Data/zipdata.mat') +mat_data = loadmat(filename) # Extract variables of interest -testdata = mat_data['testdata'] -traindata = mat_data['traindata'] -X = traindata[:,1:] -y = traindata[:,0] +testdata = mat_data["testdata"] +traindata = mat_data["traindata"] +X = traindata[:, 1:] +y = traindata[:, 0] # Visualize the i'th digit as a vector f = figure() -subplot(4,1,4) -imshow(np.expand_dims(X[i,:],axis=0), extent=(0,256,0,10), cmap=cm.gray_r) -xlabel('Pixel number') -title('Digit in vector format') +subplot(4, 1, 4) +imshow(np.expand_dims(X[i, :], axis=0), extent=(0, 256, 0, 10), cmap=cm.gray_r) +xlabel("Pixel number") +title("Digit in vector format") yticks([]) # Visualize the i'th digit as an image -subplot(2,1,1) -I = np.reshape(X[i,:],(16,16)) -imshow(I, extent=(0,16,0,16), cmap=cm.gray_r) -title('Digit as an image') +subplot(2, 1, 1) +I = np.reshape(X[i, :], (16, 16)) +imshow(I, extent=(0, 16, 0, 16), cmap=cm.gray_r) +title("Digit as an image") show() -print('Ran Exercise 2.2.1') \ No newline at end of file +print("Ran Exercise 2.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py index 123c0780513c140d24e19a64b3c5f6f1a59b461e..3d6104b2aae59b5bddd2d2afa9157d3d2e181485 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_2_2.py @@ -1,13 +1,26 @@ # exercise 2.2.2 - -from matplotlib.pyplot import (figure, subplot, plot, xlabel, ylabel, title, -yticks, show,legend,imshow, cm) -from scipy.io import loadmat -import scipy.linalg as linalg +import importlib_resources import numpy as np +import scipy.linalg as linalg +from matplotlib.pyplot import ( + cm, + figure, + imshow, + legend, + plot, + show, + subplot, + title, + xlabel, + ylabel, + yticks, +) +from scipy.io import loadmat + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") # Digits to include in analysis (to include all, n = range(10) ) -n = [0,1] +n = [0, 1] # Number of principal components for reconstruction K = 16 # Digits to visualize @@ -16,88 +29,89 @@ nD = 
range(6) # Load Matlab data file to python dict structure # and extract variables of interest -traindata = loadmat('../Data/zipdata.mat')['traindata'] -X = traindata[:,1:] -y = traindata[:,0] +traindata = loadmat(filename)["traindata"] +X = traindata[:, 1:] +y = traindata[:, 0] -N,M = X.shape +N, M = X.shape C = len(n) classValues = n classNames = [str(num) for num in n] -classDict = dict(zip(classNames,classValues)) +classDict = dict(zip(classNames, classValues)) # Select subset of digits classes to be inspected class_mask = np.zeros(N).astype(bool) for v in n: - cmsk = (y == v) + cmsk = y == v class_mask = class_mask | cmsk -X = X[class_mask,:] +X = X[class_mask, :] y = y[class_mask] -N=X.shape[0] +N = X.shape[0] # Center the data (subtract mean column values) -Xc = X - np.ones((N,1))*X.mean(0) +Xc = X - np.ones((N, 1)) * X.mean(0) # PCA by computing SVD of Y -U,S,V = linalg.svd(Xc,full_matrices=False) -#U = mat(U) +U, S, V = linalg.svd(Xc, full_matrices=False) +# U = mat(U) V = V.T # Compute variance explained by principal components -rho = (S*S) / (S*S).sum() +rho = (S * S) / (S * S).sum() # Project data onto principal component space Z = Xc @ V # Plot variance explained figure() -plot(rho,'o-') -title('Variance explained by principal components') -xlabel('Principal component') -ylabel('Variance explained value') +plot(rho, "o-") +title("Variance explained by principal components") +xlabel("Principal component") +ylabel("Variance explained value") # Plot PCA of the data f = figure() -title('pixel vectors of handwr. digits projected on PCs') +title("pixel vectors of handwr. digits projected on PCs") for c in n: # select indices belonging to class c: - class_mask = (y == c) - plot(Z[class_mask,0], Z[class_mask,1], 'o') + class_mask = y == c + plot(Z[class_mask, 0], Z[class_mask, 1], "o") legend(classNames) -xlabel('PC1') -ylabel('PC2') +xlabel("PC1") +ylabel("PC2") # Visualize the reconstructed data from the first K principal components # Select randomly D digits. 
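A small stand-alone check of the rank-K identity that the reconstruction step of ex2_2_2 relies on, namely X ≈ Z[:, :K] V[:, :K]^T + mean with Z = Y V. Everything below is illustrative (synthetic data, hypothetical names), not part of the exercise script.

import numpy as np
from scipy.linalg import svd

rng = np.random.default_rng(1)
A = rng.normal(size=(50, 20))            # synthetic data matrix
A_mean = A.mean(axis=0)
Y = A - A_mean                           # centered data
U, S, Vh = svd(Y, full_matrices=False)
V = Vh.T
Z = Y @ V                                # projection onto all principal components

for K in (2, 5, 20):
    A_hat = Z[:, :K] @ V[:, :K].T + A_mean   # rank-K reconstruction
    print(K, np.linalg.norm(A - A_hat))      # error shrinks; ~0 at full rank K=20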
-figure(figsize=(10,3)) -W = Z[:,range(K)] @ V[:,range(K)].T +figure(figsize=(10, 3)) +W = Z[:, range(K)] @ V[:, range(K)].T D = len(nD) for d in range(D): - digit_ix = np.random.randint(0,N) - subplot(2, D, int(d+1)) - I = np.reshape(X[digit_ix,:], (16,16)) + digit_ix = np.random.randint(0, N) + subplot(2, D, int(d + 1)) + I = np.reshape(X[digit_ix, :], (16, 16)) imshow(I, cmap=cm.gray_r) - title('Original') - subplot(2, D, D+d+1) - I = np.reshape(W[digit_ix,:]+X.mean(0), (16,16)) + title("Original") + subplot(2, D, D + d + 1) + I = np.reshape(W[digit_ix, :] + X.mean(0), (16, 16)) imshow(I, cmap=cm.gray_r) - title('Reconstr.') - + title("Reconstr.") + # Visualize the pricipal components -figure(figsize=(8,6)) +figure(figsize=(8, 6)) for k in range(K): - N1 = int(np.ceil(np.sqrt(K))); N2 = int(np.ceil(K/N1)) - subplot(N2, N1, int(k+1)) - I = np.reshape(V[:,k], (16,16)) + N1 = int(np.ceil(np.sqrt(K))) + N2 = int(np.ceil(K / N1)) + subplot(N2, N1, int(k + 1)) + I = np.reshape(V[:, k], (16, 16)) imshow(I, cmap=cm.hot) - title('PC{0}'.format(k+1)) + title("PC{0}".format(k + 1)) # output to screen show() -print('Ran Exercise 2.2.2') \ No newline at end of file +print("Ran Exercise 2.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py index f54d26f253f96103d8c832194f10adac1e16fa23..9f665b581d0c86e7f5ba07bf5e943e04ff0a273e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex2_3_1.py @@ -1,30 +1,32 @@ # exercise 2.3.1 -from matplotlib.pyplot import figure, plot, xlabel, ylabel, show +import importlib_resources +import numpy as np +import scipy.linalg as linalg +from matplotlib.pyplot import figure, plot, show, xlabel, ylabel from scipy.io import loadmat from sklearn.neighbors import KNeighborsClassifier -import scipy.linalg as linalg -import numpy as np +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") # Number of principal components to use for classification, # i.e. 
the reduced dimensionality -K = [8,10,15,20,30,40,50,60,100,150] +K = [8, 10, 15, 20, 30, 40, 50, 60, 100, 150] # Load Matlab data file and extract training set and test set -mat_data = loadmat('../Data/zipdata.mat') -X = mat_data['traindata'][:,1:] -y = mat_data['traindata'][:,0] -Xtest = mat_data['testdata'][:,1:] -ytest = mat_data['testdata'][:,0] -N,M = X.shape -Ntest = Xtest.shape[0] # or Xtest[:,0].shape +mat_data = loadmat(filename) +X = mat_data["traindata"][:, 1:] +y = mat_data["traindata"][:, 0] +Xtest = mat_data["testdata"][:, 1:] +ytest = mat_data["testdata"][:, 0] +N, M = X.shape +Ntest = Xtest.shape[0] # or Xtest[:,0].shape # Subtract the mean from the data -Y = X - np.ones((N,1))*X.mean(0) -Ytest = Xtest - np.ones((Ntest,1))*X.mean(0) +Y = X - np.ones((N, 1)) * X.mean(0) +Ytest = Xtest - np.ones((Ntest, 1)) * X.mean(0) # Obtain the PCA solution by calculate the SVD of Y -U,S,V = linalg.svd(Y,full_matrices=False) +U, S, V = linalg.svd(Y, full_matrices=False) V = V.T @@ -32,25 +34,25 @@ V = V.T error_rates = [] for k in K: # Project data onto principal component space, - Z = Y @ V[:,:k] - Ztest = Ytest @ V[:,:k] + Z = Y @ V[:, :k] + Ztest = Ytest @ V[:, :k] # Classify data with knn classifier knn_classifier = KNeighborsClassifier(n_neighbors=1) - knn_classifier.fit(Z,y.ravel()) + knn_classifier.fit(Z, y.ravel()) y_estimated = knn_classifier.predict(Ztest) # Compute classification error rates y_estimated = y_estimated.T - er = (sum(ytest!=y_estimated)/float(len(ytest)))*100 + er = (sum(ytest != y_estimated) / float(len(ytest))) * 100 error_rates.append(er) - print('K={0}: Error rate: {1:.1f}%'.format(k, er)) + print("K={0}: Error rate: {1:.1f}%".format(k, er)) # Visualize error rates vs. number of principal components considered figure() -plot(K,error_rates,'o-') -xlabel('Number of principal components K') -ylabel('Error rate [%]') +plot(K, error_rates, "o-") +xlabel("Number of principal components K") +ylabel("Error rate [%]") show() -print('Ran Exercise 2.3.1') \ No newline at end of file +print("Ran Exercise 2.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py index 661759e4ea700063225bc588e580457ea479d76a..60a39cf53d0415f6573b00f633de451559a70dff 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_2.py @@ -1,21 +1,23 @@ # exercise 3.1.4 +import importlib_resources import numpy as np from sklearn.feature_extraction.text import CountVectorizer +filename = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt") # Load the textDocs.txt as a long string into raw_file: -with open('../Data/textDocs.txt', 'r') as f: +with open(filename, "r") as f: raw_file = f.read() -# raw_file contains sentences seperated by newline characters, +# raw_file contains sentences seperated by newline characters, # so we split by '\n': -corpus = raw_file.split('\n') -# corpus is now list of "documents" (sentences), but some of them are empty, +corpus = raw_file.split("\n") +# corpus is now list of "documents" (sentences), but some of them are empty, # because textDocs.txt has a lot of empty lines, we filter/remove them: corpus = list(filter(None, corpus)) # Display the result -print('Document-term matrix analysis') +print("Document-term matrix analysis") print() -print('Corpus (5 documents/sentences):') +print("Corpus (5 documents/sentences):") print(np.asmatrix(corpus)) print() @@ -23,9 +25,9 @@ print() # To automatically obtain the bag of words 
representation, we use sklearn's # feature_extraction.text module, which has a function CountVectorizer. # We make a CounterVectorizer: -vectorizer = CountVectorizer(token_pattern=r'\b[^\d\W]+\b') -# The token pattern is a regular expression (marked by the r), which ensures -# that the vectorizer ignores digit/non-word tokens - in this case, it ensures +vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b") +# The token pattern is a regular expression (marked by the r), which ensures +# that the vectorizer ignores digit/non-word tokens - in this case, it ensures # the 10 in the last document is not recognized as a token. It's not important # that you should understand it the regexp. @@ -34,19 +36,19 @@ vectorizer = CountVectorizer(token_pattern=r'\b[^\d\W]+\b') vectorizer.fit(corpus) # The vectorizer has now determined the unique terms (or tokens) in the corpus # and we can extract them using: -attributeNames = vectorizer.get_feature_names() -print('Found terms:') +attributeNames = vectorizer.get_feature_names_out() +print("Found terms:") print(attributeNames) print() # The next step is to count how many times each term is found in each document, # which we do using the transform function: X = vectorizer.transform(corpus) -N,M = X.shape -print('Number of documents (data objects, N):\t %i' % N) -print('Number of terms (attributes, M):\t %i' % M ) +N, M = X.shape +print("Number of documents (data objects, N):\t %i" % N) +print("Number of terms (attributes, M):\t %i" % M) print() -print('Document-term matrix:') +print("Document-term matrix:") print(X.toarray()) print() -print('Ran Exercise 3.1.2') \ No newline at end of file +print("Ran Exercise 3.1.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py index cfe6f69ab241a98b4f37fc1413f9e338175e5a0c..8dbc103785c16864f67a6bc1bce846eb6332f3c3 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_3.py @@ -1,37 +1,42 @@ # exercise 3.1.4 +import importlib_resources from sklearn.feature_extraction.text import CountVectorizer +filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt") +filename_stop = importlib_resources.files("dtuimldmtools").joinpath( + "stopWords.txt" +) + # As before, load the corpus and preprocess: -with open('../Data/textDocs.txt', 'r') as f: +with open(filename_docs, "r") as f: raw_file = f.read() -corpus = raw_file.split('\n') +corpus = raw_file.split("\n") corpus = list(filter(None, corpus)) # Load and process the stop words in a similar manner: -with open('../Data/stopWords.txt', 'r') as f: +with open(filename_stop, "r") as f: raw_file = f.read() -stopwords = raw_file.split('\n') +stopwords = raw_file.split("\n") # When making the CountVectorizer, we now input the stop words: -vectorizer = CountVectorizer(token_pattern=r'\b[^\d\W]+\b', - stop_words=stopwords) +vectorizer = CountVectorizer(token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords) # Determine the terms in the corpus vectorizer.fit(corpus) # ... 
and count the frequency of each term within a document: X = vectorizer.transform(corpus) -attributeNames = vectorizer.get_feature_names() -N,M = X.shape +attributeNames = vectorizer.get_feature_names_out() +N, M = X.shape # Display the result -print('Document-term matrix analysis (using stop words)') +print("Document-term matrix analysis (using stop words)") print() -print('Number of documents (data objects, N):\t %i' % N) -print('Number of terms (attributes, M):\t %i' % M ) +print("Number of documents (data objects, N):\t %i" % N) +print("Number of terms (attributes, M):\t %i" % M) print() -print('Found terms (no stop words):') +print("Found terms (no stop words):") print(attributeNames) print() -print('Document-term matrix:') +print("Document-term matrix:") print(X.toarray()) print() -print('Ran Exercise 3.1.3') \ No newline at end of file +print("Ran Exercise 3.1.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py index 53ba9863e5bd7ec0723b8cc1b07e68844543f812..b71c5b3396b701e04ae7f7be58b5219dc42ede43 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_4.py @@ -1,55 +1,68 @@ # exercise 3.1.4 -from sklearn.feature_extraction.text import CountVectorizer +import importlib_resources # We'll use a widely used stemmer based: # Porter, M. “An algorithm for suffix stripping.†Program 14.3 (1980): 130-137. # The stemmer is implemented in the most used natural language processing # package in Python, "Natural Langauge Toolkit" (NLTK): from nltk.stem import PorterStemmer +from sklearn.feature_extraction.text import CountVectorizer + +filename_docs = importlib_resources.files("dtuimldmtools").joinpath("data/textDocs.txt") +filename_stop = importlib_resources.files("dtuimldmtools").joinpath( + "stopWords.txt" +) -# Load and process the corpus and stop words: -with open('../Data/textDocs.txt', 'r') as f: +# As before, load the corpus and preprocess: +with open(filename_docs, "r") as f: raw_file = f.read() -corpus = raw_file.split('\n') +corpus = raw_file.split("\n") corpus = list(filter(None, corpus)) -with open('../Data/stopWords.txt', 'r') as f: +# Load and process the stop words in a similar manner: +with open(filename_stop, "r") as f: raw_file = f.read() -stopwords = raw_file.split('\n') +stopwords = raw_file.split("\n") -# To enable stemming when using the sklearn-module, we need to parse an -# "analyzer" to the vectorizer we've been using. + +# To enable stemming when using the sklearn-module, we need to parse an +# "analyzer" to the vectorizer we've been using. # First, we make an object based on the PorterStemmer class, and we also make # an analyzer object: stemmer = PorterStemmer() -analyzer = CountVectorizer(token_pattern=r'\b[^\d\W]+\b', - stop_words=stopwords).build_analyzer() +analyzer = CountVectorizer( + token_pattern=r"\b[^\d\W]+\b", stop_words=stopwords +).build_analyzer() + + # Using these we'll make a function that can stem words: def stemmed_words(doc): return (stemmer.stem(w) for w in analyzer(doc)) + + # ... and finally, we make a vectorizer just like we've done before: -vectorizer = CountVectorizer(analyzer=stemmed_words) +vectorizer = CountVectorizer(analyzer=stemmed_words) # Determine the terms: vectorizer.fit(corpus) -attributeNames = vectorizer.get_feature_names() +attributeNames = vectorizer.get_feature_names_out() # ... 
and count the occurences: X = vectorizer.transform(corpus) -N,M = X.shape +N, M = X.shape X = X.toarray() # Display the result -print('Document-term matrix analysis (using stop words and stemming)') +print("Document-term matrix analysis (using stop words and stemming)") print() -print('Number of documents (data objects, N):\t %i' % N) -print('Number of terms (attributes, M):\t %i' % M ) +print("Number of documents (data objects, N):\t %i" % N) +print("Number of terms (attributes, M):\t %i" % M) print() -print('Found terms (no stop words, stemmed):') +print("Found terms (no stop words, stemmed):") print(attributeNames) print() -print('Document-term matrix:') +print("Document-term matrix:") print(X) print() -print('Ran Exercise 3.1.4') -print() \ No newline at end of file +print("Ran Exercise 3.1.4") +print() diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py index 9be9bd0b2aaf6f010dddbff279e038c575aaf8c8..d28d50af788bc9a8e83f8db8e2461ce49ba0e7a8 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_1_5.py @@ -1,35 +1,38 @@ # exercise 3.1.5 import numpy as np import scipy.linalg as linalg -from toolbox_02450.similarity import similarity - from ex3_1_4 import * +from dtuimldmtools import similarity + # Query vector q = np.array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]) # notice, that you could get the query vector using the vectorizer, too: -#q = vectorizer.transform(['matrix rank solv']) -#q = np.asarray(q.toarray()) +# q = vectorizer.transform(['matrix rank solv']) +# q = np.asarray(q.toarray()) # or use any other string: -#q = vectorizer.transform(['Can I Google how to fix my problem?']) -#q = np.asarray(q.toarray()) +# q = vectorizer.transform(['Can I Google how to fix my problem?']) +# q = np.asarray(q.toarray()) # Method 1 ('for' loop - slow) -N = np.shape(X)[0]; # get the number of data objects -sim = np.zeros((N,1)) # allocate a vector for the similarity +N = np.shape(X)[0] +# get the number of data objects +sim = np.zeros((N, 1)) # allocate a vector for the similarity for i in range(N): - x = X[i,:] # Get the i'th data object (here: document) - sim[i] = q/linalg.norm(q) @ x.T/linalg.norm(x) # Compute cosine similarity + x = X[i, :] # Get the i'th data object (here: document) + sim[i] = q / linalg.norm(q) @ x.T / linalg.norm(x) # Compute cosine similarity # Method 2 (one line of code with no iterations - faster) -sim = (q @ X.T).T / (np.sqrt(np.power(X,2).sum(axis=1)) * np.sqrt(np.power(q,2).sum())) +sim = (q @ X.T).T / ( + np.sqrt(np.power(X, 2).sum(axis=1)) * np.sqrt(np.power(q, 2).sum()) +) # Method 3 (use the "similarity" function) -sim = similarity(X, q, 'cos'); +sim = similarity(X, q, "cos") # Display the result -print('Query vector:\n {0}\n'.format(q)) -print('Similarity results:\n {0}'.format(sim)) +print("Query vector:\n {0}\n".format(q)) +print("Similarity results:\n {0}".format(sim)) -print('Ran Exercise 3.1.5') \ No newline at end of file +print("Ran Exercise 3.1.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py index aef67d775b93f363ea3d4ad0cf6ae177e939d19a..dbc1dc0af5914508cd15466c199cdb24c2fd3bc2 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_2_1.py @@ -7,13 +7,13 @@ x = np.array([-0.68, -2.11, 2.39, 0.26, 1.46, 1.33, 1.03, -0.41, -0.33, 0.47]) mean_x = x.mean() std_x = x.std(ddof=1) median_x = np.median(x) 
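A self-contained check that the one-line, vectorized cosine similarity of ex3_1_5 agrees with the explicit loop. The toy matrix and query below are made up for the example, not taken from the text corpus.

import numpy as np

rng = np.random.default_rng(2)
X_toy = rng.random((6, 10))              # toy document-term matrix, 6 docs x 10 terms
q = rng.random(10)                       # toy query vector

# Loop version: cosine similarity between q and each row of X_toy
sim_loop = np.array([q @ x / (np.linalg.norm(q) * np.linalg.norm(x)) for x in X_toy])

# Vectorized version (same formula as in the script)
sim_vec = (X_toy @ q) / (np.sqrt((X_toy**2).sum(axis=1)) * np.sqrt((q**2).sum()))

print(np.allclose(sim_loop, sim_vec))    # expected: True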
-range_x = x.max()-x.min() +range_x = x.max() - x.min() # Display results -print('Vector:',x) -print('Mean:',mean_x) -print('Standard Deviation:',std_x) -print('Median:',median_x) -print('Range:',range_x) +print("Vector:", x) +print("Mean:", mean_x) +print("Standard Deviation:", std_x) +print("Median:", median_x) +print("Range:", range_x) -print('Ran Exercise 3.2.1') \ No newline at end of file +print("Ran Exercise 3.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py index 5ed8dd5aae0919ecc1b2a799beee4bb62710fab0..f0fca9634def84749f1819bfc78cf58cac693c1f 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_3_1.py @@ -1,74 +1,84 @@ # exercise 3.3.1 +import importlib_resources import matplotlib.pyplot as plt import numpy as np from scipy.io import loadmat -from toolbox_02450.similarity import similarity +from dtuimldmtools import similarity + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/digits.mat") # Image to use as query i = 1 -# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' -similarity_measure = 'SMC' +# Similarity: 'SMC', 'Jaccard', 'ExtendedJaccard', 'Cosine', 'Correlation' +similarity_measure = "SMC" # Load the digits # Load Matlab data file to python dict structure -X = loadmat('../Data/digits.mat')['X'] +X = loadmat(filename)["X"] # You can also try the CBCL faces dataset (remember to change 'transpose') -#X = loadmat('../Data/wildfaces_grayscale.mat')['X'] +# X = loadmat('../Data/wildfaces_grayscale.mat')['X'] N, M = X.shape -transpose = False # should the plotted images be transposed? +transpose = False # should the plotted images be transposed? -# Search the face database for similar faces +# Search the face database for similar faces # Index of all other images than i -noti = list(range(0,i)) + list(range(i+1,N)) +noti = list(range(0, i)) + list(range(i + 1, N)) # Compute similarity between image i and all others -sim = similarity(X[i,:], X[noti,:], similarity_measure) +sim = similarity(X[i, :], X[noti, :], similarity_measure) sim = sim.tolist()[0] # Tuples of sorted similarities and their indices -sim_to_index = sorted(zip(sim,noti)) +sim_to_index = sorted(zip(sim, noti)) # Visualize query image and 5 most/least similar images -plt.figure(figsize=(12,8)) -plt.subplot(3,1,1) +plt.figure(figsize=(12, 8)) +plt.subplot(3, 1, 1) img_hw = int(np.sqrt(len(X[0]))) -img = np.reshape(X[i], (img_hw,img_hw)) -if transpose: img = img.T +img = np.reshape(X[i], (img_hw, img_hw)) +if transpose: + img = img.T plt.imshow(img, cmap=plt.cm.gray) -plt.xticks([]); plt.yticks([]) -plt.title('Query image') -plt.ylabel('image #{0}'.format(i)) +plt.xticks([]) +plt.yticks([]) +plt.title("Query image") +plt.ylabel("image #{0}".format(i)) for ms in range(5): - # 5 most similar images found - plt.subplot(3,5,6+ms) - im_id = sim_to_index[-ms-1][1] - im_sim = sim_to_index[-ms-1][0] - img = np.reshape(X[im_id],(img_hw,img_hw)) - if transpose: img = img.T + plt.subplot(3, 5, 6 + ms) + im_id = sim_to_index[-ms - 1][1] + im_sim = sim_to_index[-ms - 1][0] + img = np.reshape(X[im_id], (img_hw, img_hw)) + if transpose: + img = img.T plt.imshow(img, cmap=plt.cm.gray) - plt.xlabel('sim={0:.3f}'.format(im_sim)) - plt.ylabel('image #{0}'.format(im_id)) - plt.xticks([]); plt.yticks([]) - if ms==2: plt.title('Most similar images') + plt.xlabel("sim={0:.3f}".format(im_sim)) + plt.ylabel("image #{0}".format(im_id)) + plt.xticks([]) + plt.yticks([]) + 
if ms == 2: + plt.title("Most similar images") # 5 least similar images found - plt.subplot(3,5,11+ms) + plt.subplot(3, 5, 11 + ms) im_id = sim_to_index[ms][1] im_sim = sim_to_index[ms][0] - img = np.reshape(X[im_id],(img_hw,img_hw)) - if transpose: img = img.T + img = np.reshape(X[im_id], (img_hw, img_hw)) + if transpose: + img = img.T plt.imshow(img, cmap=plt.cm.gray) - plt.xlabel('sim={0:.3f}'.format(im_sim)) - plt.ylabel('image #{0}'.format(im_id)) - plt.xticks([]); plt.yticks([]) - if ms==2: plt.title('Least similar images') - + plt.xlabel("sim={0:.3f}".format(im_sim)) + plt.ylabel("image #{0}".format(im_id)) + plt.xticks([]) + plt.yticks([]) + if ms == 2: + plt.title("Least similar images") + plt.show() -print('Ran Exercise 3.3.1') \ No newline at end of file +print("Ran Exercise 3.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py index 889a14d4199496a9823fa34fa59afd3147239702..e7abec8d3c302ff9414b0afb8925596c1dc69d3f 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex3_3_2.py @@ -1,23 +1,42 @@ # exercise 3.2.2 import numpy as np -from toolbox_02450.similarity import similarity + +from dtuimldmtools import similarity # Generate two data objects with M random attributes -M = 5; -x = np.random.rand(1,M) -y = np.random.rand(1,M) +M = 5 +x = np.random.rand(1, M) +y = np.random.rand(1, M) # Two constants a = 1.5 b = 1.5 # Check the statements in the exercise -print("Cosine scaling: %.4f " % (similarity(x,y,'cos') - similarity(a*x,y,'cos'))[0,0]) -print("ExtendedJaccard scaling: %.4f " % (similarity(x,y,'ext') - similarity(a*x,y,'ext'))[0,0]) -print("Correlation scaling: %.4f " % (similarity(x,y,'cor') - similarity(a*x,y,'cor'))[0,0]) -print("Cosine translation: %.4f " % (similarity(x,y,'cos') - similarity(b+x,y,'cos'))[0,0]) -print("ExtendedJaccard translation: %.4f " % (similarity(x,y,'ext') - similarity(b+x,y,'ext'))[0,0]) -print("Correlation translation: %.4f " % (similarity(x,y,'cor') - similarity(b+x,y,'cor'))[0,0]) +print( + "Cosine scaling: %.4f " + % (similarity(x, y, "cos") - similarity(a * x, y, "cos"))[0, 0] +) +print( + "ExtendedJaccard scaling: %.4f " + % (similarity(x, y, "ext") - similarity(a * x, y, "ext"))[0, 0] +) +print( + "Correlation scaling: %.4f " + % (similarity(x, y, "cor") - similarity(a * x, y, "cor"))[0, 0] +) +print( + "Cosine translation: %.4f " + % (similarity(x, y, "cos") - similarity(b + x, y, "cos"))[0, 0] +) +print( + "ExtendedJaccard translation: %.4f " + % (similarity(x, y, "ext") - similarity(b + x, y, "ext"))[0, 0] +) +print( + "Correlation translation: %.4f " + % (similarity(x, y, "cor") - similarity(b + x, y, "cor"))[0, 0] +) -print('Ran Exercise 3.2.2') \ No newline at end of file +print("Ran Exercise 3.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py index 94d8b0bbf55bc290343e3319fdff71ca947470ac..e3a77ebc106a018ad928c30c1da83736957aa782 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_1.py @@ -1,8 +1,7 @@ # exercise 4.1.1 -from matplotlib.pyplot import (figure, title, subplot, plot, hist, show) import numpy as np - +from matplotlib.pyplot import figure, hist, plot, show, subplot, title # Number of samples N = 200 @@ -17,17 +16,17 @@ s = 2 nbins = 20 # Generate samples from the Normal distribution -X = np.random.normal(mu,s,N).T +X = np.random.normal(mu, s, N).T # or equally: X = 
np.random.randn(N).T * s + mu # Plot the samples and histogram -figure(figsize=(12,4)) -title('Normal distribution') -subplot(1,2,1) -plot(X,'.') -subplot(1,3,3) +figure(figsize=(12, 4)) +title("Normal distribution") +subplot(1, 2, 1) +plot(X, ".") +subplot(1, 3, 3) hist(X, bins=nbins) show() -print('Ran Exercise 4.1.1') \ No newline at end of file +print("Ran Exercise 4.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py index 77949b22e164ab3557c00b3493a8ac0c02e8a52e..f2fc64f623da810e918b6e835d6f0eadb6f8a2c5 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_2.py @@ -1,7 +1,8 @@ # exercise 4.1.2 -from matplotlib.pyplot import (figure, title, subplot, plot, hist, show) import numpy as np +from matplotlib.pyplot import figure, hist, plot, show, subplot, title + # Number of samples N = 200 @@ -15,16 +16,16 @@ s = 2 nbins = 20 # Generate samples from the Normal distribution -X = np.random.normal(mu,s,N).T +X = np.random.normal(mu, s, N).T # or equally: X = np.random.randn(N).T * s + mu # Plot the samples and histogram figure() -title('Normal distribution') -subplot(1,2,1) -plot(X,'x') -subplot(1,2,2) +title("Normal distribution") +subplot(1, 2, 1) +plot(X, "x") +subplot(1, 2, 2) hist(X, bins=nbins) # Compute empirical mean and standard deviation @@ -38,4 +39,4 @@ print("Empirical std.dev.: ", s_) show() -print('Ran Exercise 4.1.2') \ No newline at end of file +print("Ran Exercise 4.1.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py index 719dfc02e815303cfb6caa270bf2d69e42859c6a..04eda67169c6492e3b67bc20de521878f602f730 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_3.py @@ -1,7 +1,7 @@ # exercise 4.1.3 -from matplotlib.pyplot import (figure, title, subplot, plot, hist, show) import numpy as np +from matplotlib.pyplot import figure, hist, plot, show, subplot, title from scipy import stats # Number of samples @@ -17,19 +17,19 @@ s = 2 nbins = 20 # Generate samples from the Normal distribution -X = np.random.normal(mu,s,N).T +X = np.random.normal(mu, s, N).T # or equally: X = np.random.randn(N).T * s + mu # Plot the histogram f = figure() -title('Normal distribution') +title("Normal distribution") hist(X, bins=nbins, density=True) # Over the histogram, plot the theoretical probability distribution function: x = np.linspace(X.min(), X.max(), 1000) -pdf = stats.norm.pdf(x,loc=17,scale=2) -plot(x,pdf,'.',color='red') +pdf = stats.norm.pdf(x, loc=17, scale=2) +plot(x, pdf, ".", color="red") # Compute empirical mean and standard deviation mu_ = X.mean() @@ -42,4 +42,4 @@ print("Empirical std.dev.: ", s_) show() -print('Ran Exercise 4.1.3') \ No newline at end of file +print("Ran Exercise 4.1.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py index b451960a83613178f34266885c8ae840287405a7..42e7588f9653eb0be39a2a696d8f80a3afb6ccf4 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_4.py @@ -9,9 +9,9 @@ N = 1000 mu = np.array([13, 17]) # Covariance matrix -S = np.array([[4,3],[3,9]]) +S = np.array([[4, 3], [3, 9]]) # Generate samples from the Normal distribution X = np.random.multivariate_normal(mu, S, N) -print('Ran Exercise 4.1.4') \ No newline at end of file +print("Ran Exercise 4.1.4") diff --git 
a/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py index 06e5d131cc0d53506392d06a225da63d32181c88..90bfd4b6807a3b112c2cc747622eebf2a49da889 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_5.py @@ -1,9 +1,22 @@ # exercise 4.1.5 -from matplotlib.pyplot import (figure, title, subplot, plot, hist, show, - xlabel, ylabel, xticks, yticks, colorbar, cm, - imshow, suptitle) import numpy as np +from matplotlib.pyplot import ( + cm, + colorbar, + figure, + hist, + imshow, + plot, + show, + subplot, + suptitle, + title, + xlabel, + xticks, + ylabel, + yticks, +) # Number of samples N = 1000 @@ -18,7 +31,7 @@ s2 = 3 corr = 0.5 # Covariance matrix -S = np.matrix([[s1*s1, corr*s1*s2], [corr*s1*s2, s2*s2]]) +S = np.matrix([[s1 * s1, corr * s1 * s2], [corr * s1 * s2, s2 * s2]]) # Mean mu = np.array([13, 17]) @@ -31,21 +44,25 @@ X = np.random.multivariate_normal(mu, S, N) # Plot scatter plot of data -figure(figsize=(12,8)) -suptitle('2-D Normal distribution') +figure(figsize=(12, 8)) +suptitle("2-D Normal distribution") -subplot(1,2,1) -plot(X[:,0], X[:,1], 'x') -xlabel('x1'); ylabel('x2') -title('Scatter plot of data') +subplot(1, 2, 1) +plot(X[:, 0], X[:, 1], "x") +xlabel("x1") +ylabel("x2") +title("Scatter plot of data") -subplot(1,2,2) -x = np.histogram2d(X[:,0], X[:,1], nbins) -imshow(x[0], cmap=cm.gray_r, interpolation='None', origin='lower') +subplot(1, 2, 2) +x = np.histogram2d(X[:, 0], X[:, 1], nbins) +imshow(x[0], cmap=cm.gray_r, interpolation="None", origin="lower") colorbar() -xlabel('x1'); ylabel('x2'); xticks([]); yticks([]); -title('2D histogram') +xlabel("x1") +ylabel("x2") +xticks([]) +yticks([]) +title("2D histogram") show() -print('Ran Exercise 4.1.5') \ No newline at end of file +print("Ran Exercise 4.1.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py index 8504cb22f59367cec86792aa4c4600543a9a30fc..e74e856f3bdb6d3d6b3031aa287213100a017caa 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_6.py @@ -1,28 +1,29 @@ # exercise 4.1.6 -from matplotlib.pyplot import (figure, subplot, title, imshow, xticks, yticks, - show, cm) +import importlib_resources +import numpy as np import scipy.linalg as linalg +from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xticks, yticks from scipy.io import loadmat -import numpy as np +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") # Digits to include in analysis (to include all: n = range(10)) n = [0] # Load Matlab data file to python dict structure # and extract variables of interest -traindata = loadmat('../Data/zipdata.mat')['traindata'] -X = traindata[:,1:] -y = traindata[:,0] +traindata = loadmat(filename)["traindata"] +X = traindata[:, 1:] +y = traindata[:, 0] N, M = X.shape C = len(n) # Remove digits that are not to be inspected class_mask = np.zeros(N).astype(bool) for v in n: - cmsk = (y==v) + cmsk = y == v class_mask = class_mask | cmsk -X = X[class_mask,:] +X = X[class_mask, :] y = y[class_mask] N = np.shape(X)[0] @@ -31,17 +32,19 @@ s = X.std(ddof=1, axis=0) S = np.cov(X, rowvar=0, ddof=1) figure() -subplot(1,2,1) -I = np.reshape(mu, (16,16)) +subplot(1, 2, 1) +I = np.reshape(mu, (16, 16)) imshow(I, cmap=cm.gray_r) -title('Mean') -xticks([]); yticks([]) -subplot(1,2,2) -I = np.reshape(s, (16,16)) +title("Mean") +xticks([]) +yticks([]) +subplot(1, 2, 2) +I 
= np.reshape(s, (16, 16)) imshow(I, cmap=cm.gray_r) -title('Standard deviation') -xticks([]); yticks([]) +title("Standard deviation") +xticks([]) +yticks([]) show() -print('Ran Exercise 4.1.6') \ No newline at end of file +print("Ran Exercise 4.1.6") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py b/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py index 93d8df180412679e3e7f91a3a73ec40e67a9c244..c6ded613d21f1cf7fa41772fbe1d8658bff95dbe 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_1_7.py @@ -1,10 +1,11 @@ # exercise 4.1.7 -from matplotlib.pyplot import (figure, subplot, imshow, xticks, yticks, title, - cm, show) +import importlib_resources import numpy as np +from matplotlib.pyplot import cm, figure, imshow, show, subplot, title, xticks, yticks from scipy.io import loadmat +filename = importlib_resources.files("dtuimldmtools").joinpath("data/zipdata.mat") # Digits to include in analysis (to include all, n = range(10) ) n = [1] @@ -13,43 +14,45 @@ ngen = 10 # Load Matlab data file to python dict structure # and extract variables of interest -traindata = loadmat('../Data/zipdata.mat')['traindata'] -X = traindata[:,1:] -y = traindata[:,0] -N, M = np.shape(X) #or X.shape +traindata = loadmat(filename)["traindata"] +X = traindata[:, 1:] +y = traindata[:, 0] +N, M = np.shape(X) # or X.shape C = len(n) # Remove digits that are not to be inspected class_mask = np.zeros(N).astype(bool) for v in n: - cmsk = (y==v) + cmsk = y == v class_mask = class_mask | cmsk -X = X[class_mask,:] +X = X[class_mask, :] y = y[class_mask] -N = np.shape(X)[0] # or X.shape[0] +N = np.shape(X)[0] # or X.shape[0] mu = X.mean(axis=0) s = X.std(ddof=1, axis=0) S = np.cov(X, rowvar=0, ddof=1) # Generate 10 samples from 1-D normal distribution -Xgen = np.random.randn(ngen,256) +Xgen = np.random.randn(ngen, 256) for i in range(ngen): - Xgen[i] = np.multiply(Xgen[i],s) + mu + Xgen[i] = np.multiply(Xgen[i], s) + mu # Plot images figure() for k in range(ngen): - subplot(2, int(np.ceil(ngen/2.)), k+1) - I = np.reshape(Xgen[k,:], (16,16)) - imshow(I, cmap=cm.gray_r); - xticks([]); yticks([]) - if k==1: title('Digits: 1-D Normal') + subplot(2, int(np.ceil(ngen / 2.0)), k + 1) + I = np.reshape(Xgen[k, :], (16, 16)) + imshow(I, cmap=cm.gray_r) + xticks([]) + yticks([]) + if k == 1: + title("Digits: 1-D Normal") # Generate 10 samples from multivariate normal distribution Xmvgen = np.random.multivariate_normal(mu, S, ngen) -# Note if you are investigating a single class, then you may get: +# Note if you are investigating a single class, then you may get: # """RuntimeWarning: covariance is not positive-semidefinite.""" # Which in general is troublesome, but here is due to numerical imprecission @@ -57,12 +60,14 @@ Xmvgen = np.random.multivariate_normal(mu, S, ngen) # Plot images figure() for k in range(ngen): - subplot(2, int(np.ceil(ngen/2.)), k+1) - I = np.reshape(Xmvgen[k,:], (16,16)) - imshow(I, cmap=cm.gray_r); - xticks([]); yticks([]) - if k==1: title('Digits: Multivariate Normal') + subplot(2, int(np.ceil(ngen / 2.0)), k + 1) + I = np.reshape(Xmvgen[k, :], (16, 16)) + imshow(I, cmap=cm.gray_r) + xticks([]) + yticks([]) + if k == 1: + title("Digits: Multivariate Normal") show() -print('Ran Exercise 4.1.7') \ No newline at end of file +print("Ran Exercise 4.1.7") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py index 
24238d46eb0b98157b07aa38d3f75a190e019f54..e6b46e8ca7e49caa7258fa553737a598d2dda417 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_1.py @@ -1,31 +1,33 @@ # exercise 4.2.1 +import importlib_resources import numpy as np import xlrd +filename = importlib_resources.files("dtuimldmtools").joinpath("data/iris.xls") # Load xls sheet with data -doc = xlrd.open_workbook('../Data/iris.xls').sheet_by_index(0) +doc = xlrd.open_workbook(filename).sheet_by_index(0) # Extract attribute names -attributeNames = doc.row_values(0,0,4) +attributeNames = doc.row_values(0, 0, 4) # Extract class names to python list, # then encode with integers (dict) -classLabels = doc.col_values(4,1,151) +classLabels = doc.col_values(4, 1, 151) classNames = sorted(set(classLabels)) -classDict = dict(zip(classNames,range(len(classNames)))) +classDict = dict(zip(classNames, range(len(classNames)))) # Extract vector y, convert to NumPy matrix and transpose y = np.array([classDict[value] for value in classLabels]) # Preallocate memory, then extract data to matrix X -X = np.empty((150,4)) +X = np.empty((150, 4)) for i in range(4): - X[:,i] = np.array(doc.col_values(i,1,151)).T + X[:, i] = np.array(doc.col_values(i, 1, 151)).T # Compute values of N, M and C. N = len(y) M = len(attributeNames) C = len(classNames) -print('Ran Exercise 4.2.1') \ No newline at end of file +print("Ran Exercise 4.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py index 03eec5a109de704f29c8673f735441b23799df47..4bef1c35cdf2df3450ee468be245671bfa4d2d72 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_2.py @@ -1,18 +1,20 @@ # Exercise 4.2.2 -from matplotlib.pyplot import figure, subplot, hist, xlabel, ylim, show import numpy as np + # requires data from exercise 4.2.1 from ex4_2_1 import * +from matplotlib.pyplot import figure, hist, show, subplot, xlabel, ylim -figure(figsize=(8,7)) -u = np.floor(np.sqrt(M)); v = np.ceil(float(M)/u) +figure(figsize=(8, 7)) +u = np.floor(np.sqrt(M)) +v = np.ceil(float(M) / u) for i in range(M): - subplot(int(u),int(v),i+1) - hist(X[:,i], color=(0.2, 0.8-i*0.2, 0.4)) + subplot(int(u), int(v), i + 1) + hist(X[:, i], color=(0.2, 0.8 - i * 0.2, 0.4)) xlabel(attributeNames[i]) - ylim(0,N/2) - + ylim(0, N / 2) + show() -print('Ran Exercise 4.2.2') \ No newline at end of file +print("Ran Exercise 4.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py index 57e41b0930448c48dcacd153e0ea095898b34cdd..a6420832b94689016a5d776b2de688666e5e04e9 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_3.py @@ -1,14 +1,13 @@ # Exercise 4.2.3 -from matplotlib.pyplot import boxplot, xticks, ylabel, title, show - # requires data from exercise 4.2.1 from ex4_2_1 import * +from matplotlib.pyplot import boxplot, show, title, xticks, ylabel boxplot(X) -xticks(range(1,5),attributeNames) -ylabel('cm') -title('Fisher\'s Iris data set - boxplot') +xticks(range(1, 5), attributeNames) +ylabel("cm") +title("Fisher's Iris data set - boxplot") show() -print('Ran Exercise 4.2.3') \ No newline at end of file +print("Ran Exercise 4.2.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py index ca8ee0613ce58fff7c5ece139da6618174d53dbc..4c394b19c505b2efd72030c49035f6dec7e4073a 100644 --- 
a/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_4.py @@ -1,22 +1,24 @@ # Exercise 4.2.4 -from matplotlib.pyplot import (figure, subplot, boxplot, title, xticks, ylim, - show) # requires data from exercise 4.1.1 from ex4_2_1 import * +from matplotlib.pyplot import boxplot, figure, show, subplot, title, xticks, ylim -figure(figsize=(14,7)) +figure(figsize=(14, 7)) for c in range(C): - subplot(1,C,c+1) - class_mask = (y==c) # binary mask to extract elements of class c + subplot(1, C, c + 1) + class_mask = y == c # binary mask to extract elements of class c # or: class_mask = nonzero(y==c)[0].tolist()[0] # indices of class c - - boxplot(X[class_mask,:]) - #title('Class: {0}'.format(classNames[c])) - title('Class: '+classNames[c]) - xticks(range(1,len(attributeNames)+1), [a[:7] for a in attributeNames], rotation=45) - y_up = X.max()+(X.max()-X.min())*0.1; y_down = X.min()-(X.max()-X.min())*0.1 + + boxplot(X[class_mask, :]) + # title('Class: {0}'.format(classNames[c])) + title("Class: " + classNames[c]) + xticks( + range(1, len(attributeNames) + 1), [a[:7] for a in attributeNames], rotation=45 + ) + y_up = X.max() + (X.max() - X.min()) * 0.1 + y_down = X.min() - (X.max() - X.min()) * 0.1 ylim(y_down, y_up) show() -print('Ran Exercise 4.2.4') \ No newline at end of file +print("Ran Exercise 4.2.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py index 29cffb402f7c629c11fdf2dc9d1fc48b20b198cf..67061cbd96c896a6df5018d3288a1e271a15b09a 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_5.py @@ -1,30 +1,38 @@ # Exercise 4.2.5 -from matplotlib.pyplot import (figure, subplot, plot, xlabel, ylabel, - xticks, yticks,legend,show) - # requires data from exercise 4.2.1 from ex4_2_1 import * +from matplotlib.pyplot import ( + figure, + legend, + plot, + show, + subplot, + xlabel, + xticks, + ylabel, + yticks, +) -figure(figsize=(12,10)) +figure(figsize=(12, 10)) for m1 in range(M): for m2 in range(M): - subplot(M, M, m1*M + m2 + 1) + subplot(M, M, m1 * M + m2 + 1) for c in range(C): - class_mask = (y==c) - plot(np.array(X[class_mask,m2]), np.array(X[class_mask,m1]), '.') - if m1==M-1: + class_mask = y == c + plot(np.array(X[class_mask, m2]), np.array(X[class_mask, m1]), ".") + if m1 == M - 1: xlabel(attributeNames[m2]) else: xticks([]) - if m2==0: + if m2 == 0: ylabel(attributeNames[m1]) else: yticks([]) - #ylim(0,X.max()*1.1) - #xlim(0,X.max()*1.1) + # ylim(0,X.max()*1.1) + # xlim(0,X.max()*1.1) legend(classNames) show() -print('Ran Exercise 4.2.5') \ No newline at end of file +print("Ran Exercise 4.2.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py index 387e81ee6891e3e32f8464563525d93da516ba70..1f625d2d23ad211e32fb255e4f81f3ff246db9d1 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_6.py @@ -1,20 +1,21 @@ # Exercise 4.2.6 -from matplotlib.pyplot import figure, show -from mpl_toolkits.mplot3d import Axes3D - # requires data from exercise 4.1.1 from ex4_2_1 import * +from matplotlib.pyplot import figure, show +from mpl_toolkits.mplot3d import Axes3D # Indices of the variables to plot ind = [0, 1, 2] -colors = ['blue', 'green', 'red'] +colors = ["blue", "green", "red"] f = figure() -ax = f.add_subplot(111, projection='3d') #Here the mpl_toolkits is used +ax = f.add_subplot(111, projection="3d") # Here 
the mpl_toolkits is used for c in range(C): - class_mask = (y==c) - s = ax.scatter(X[class_mask,ind[0]], X[class_mask,ind[1]], X[class_mask,ind[2]], c=colors[c]) + class_mask = y == c + s = ax.scatter( + X[class_mask, ind[0]], X[class_mask, ind[1]], X[class_mask, ind[2]], c=colors[c] + ) ax.view_init(30, 220) ax.set_xlabel(attributeNames[ind[0]]) @@ -23,4 +24,4 @@ ax.set_zlabel(attributeNames[ind[2]]) show() -print('Ran Exercise 4.2.6') \ No newline at end of file +print("Ran Exercise 4.2.6") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py b/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py index 747e04aa5ca85646cca76e267cbe7cc3f7295173..e34326e5e6d35f172de5329f12881cc9322dd1b1 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_2_7.py @@ -1,22 +1,30 @@ # Exercise 4.2.7 -from matplotlib.pyplot import (figure, imshow, xticks, xlabel, ylabel, title, - colorbar, cm, show) -from scipy.stats import zscore - # requires data from exercise 4.2.1 from ex4_2_1 import * +from matplotlib.pyplot import ( + cm, + colorbar, + figure, + imshow, + show, + title, + xlabel, + xticks, + ylabel, +) +from scipy.stats import zscore X_standarized = zscore(X, ddof=1) -figure(figsize=(12,6)) -imshow(X_standarized, interpolation='none', aspect=(4./N), cmap=cm.gray); +figure(figsize=(12, 6)) +imshow(X_standarized, interpolation="none", aspect=(4.0 / N), cmap=cm.gray) xticks(range(4), attributeNames) -xlabel('Attributes') -ylabel('Data objects') -title('Fisher\'s Iris data matrix') +xlabel("Attributes") +ylabel("Data objects") +title("Fisher's Iris data matrix") colorbar() show() -print('Ran Exercise 4.2.7') \ No newline at end of file +print("Ran Exercise 4.2.7") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py index b165c2aa749a37b8add8ed961575c45688c4866c..646d4d52b2324f2906a586b5a3da6d05997a454e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_3_1.py @@ -1,65 +1,83 @@ # exercise 4.3.1 -from matplotlib.pyplot import (figure, title, boxplot, xticks, subplot, hist, - xlabel, ylim, yticks, show) +import importlib_resources import numpy as np +from matplotlib.pyplot import ( + boxplot, + figure, + hist, + show, + subplot, + title, + xlabel, + xticks, + ylim, + yticks, +) from scipy.io import loadmat from scipy.stats import zscore +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -C = mat_data['C'][0,0] -M = mat_data['M'][0,0] -N = mat_data['N'][0,0] -attributeNames = [name[0][0] for name in mat_data['attributeNames']] -classNames = [cls[0][0] for cls in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +C = mat_data["C"][0, 0] +M = mat_data["M"][0, 0] +N = mat_data["N"][0, 0] +attributeNames = [name[0][0] for name in mat_data["attributeNames"]] +classNames = [cls[0][0] for cls in mat_data["classNames"]] # We start with a box plot of each attribute figure() -title('Wine: Boxplot') +title("Wine: Boxplot") boxplot(X) -xticks(range(1,M+1), attributeNames, rotation=45) +xticks(range(1, M + 1), attributeNames, rotation=45) # From this it is clear that there are some outliers in the Alcohol # attribute (10x10^14 is clearly not a proper value for alcohol content) # However, it is impossible to see the 
distribution of the data, because # the axis is dominated by these extreme outliers. To avoid this, we plot a # box plot of standardized data (using the zscore function). -figure(figsize=(12,6)) -title('Wine: Boxplot (standarized)') +figure(figsize=(12, 6)) +title("Wine: Boxplot (standarized)") boxplot(zscore(X, ddof=1), attributeNames) -xticks(range(1,M+1), attributeNames, rotation=45) +xticks(range(1, M + 1), attributeNames, rotation=45) # This plot reveals that there are clearly some outliers in the Volatile # acidity, Density, and Alcohol attributes, i.e. attribute number 2, 8, -# and 11. +# and 11. # Next, we plot histograms of all attributes. -figure(figsize=(14,9)) -u = np.floor(np.sqrt(M)); v = np.ceil(float(M)/u) +figure(figsize=(14, 9)) +u = np.floor(np.sqrt(M)) +v = np.ceil(float(M) / u) for i in range(M): - subplot(int(u),int(v),i+1) - hist(X[:,i]) + subplot(int(u), int(v), i + 1) + hist(X[:, i]) xlabel(attributeNames[i]) - ylim(0, N) # Make the y-axes equal for improved readability - if i%v!=0: yticks([]) - if i==0: title('Wine: Histogram') - + ylim(0, N) # Make the y-axes equal for improved readability + if i % v != 0: + yticks([]) + if i == 0: + title("Wine: Histogram") + # This confirms our belief about outliers in attributes 2, 8, and 11. -# To take a closer look at this, we next plot histograms of the +# To take a closer look at this, we next plot histograms of the # attributes we suspect contains outliers -figure(figsize=(14,9)) +figure(figsize=(14, 9)) m = [1, 7, 10] for i in range(len(m)): - subplot(1,len(m),i+1) - hist(X[:,m[i]],50) + subplot(1, len(m), i + 1) + hist(X[:, m[i]], 50) xlabel(attributeNames[m[i]]) - ylim(0, N) # Make the y-axes equal for improved readability - if i>0: yticks([]) - if i==0: title('Wine: Histogram (selected attributes)') + ylim(0, N) # Make the y-axes equal for improved readability + if i > 0: + yticks([]) + if i == 0: + title("Wine: Histogram (selected attributes)") # The histograms show that there are a few very extreme values in these @@ -69,30 +87,33 @@ for i in range(len(m)): # alcohol percentage to be somewhere between 5-20 % vol. Then we can safely # identify the following outliers, which are a factor of 10 greater than # the largest we expect. -outlier_mask = (X[:,1]>20) | (X[:,7]>10) | (X[:,10]>200) +outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200) valid_mask = np.logical_not(outlier_mask) # Finally we will remove these from the data set -X = X[valid_mask,:] +X = X[valid_mask, :] y = y[valid_mask] N = len(y) # Now, we can repeat the process to see if there are any more outliers # present in the data. We take a look at a histogram of all attributes: -figure(figsize=(14,9)) -u = np.floor(np.sqrt(M)); v = np.ceil(float(M)/u) +figure(figsize=(14, 9)) +u = np.floor(np.sqrt(M)) +v = np.ceil(float(M) / u) for i in range(M): - subplot(int(u),int(v),i+1) - hist(X[:,i]) + subplot(int(u), int(v), i + 1) + hist(X[:, i]) xlabel(attributeNames[i]) - ylim(0, N) # Make the y-axes equal for improved readability - if i%v!=0: yticks([]) - if i==0: title('Wine: Histogram (after outlier detection)') + ylim(0, N) # Make the y-axes equal for improved readability + if i % v != 0: + yticks([]) + if i == 0: + title("Wine: Histogram (after outlier detection)") # This reveals no further outliers, and we conclude that all outliers have # been detected and removed. 
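A minimal, stand-alone illustration of the boolean-mask filtering used above in ex4_3_1. The thresholds and array below are invented for the example and are not the wine-data values.

import numpy as np

rng = np.random.default_rng(3)
X_toy = rng.normal(loc=10, scale=2, size=(100, 3))
X_toy[5, 1] = 1e4                                          # plant one obvious outlier

outlier_mask = (X_toy[:, 1] > 100) | (X_toy[:, 2] > 100)   # element-wise OR of conditions
valid_mask = ~outlier_mask                                 # same as np.logical_not(outlier_mask)

X_clean = X_toy[valid_mask, :]                             # keep only rows not flagged as outliers
print(X_toy.shape, "->", X_clean.shape)                    # expected: (100, 3) -> (99, 3)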
show() -print('Ran Exercise 4.3.1') \ No newline at end of file +print("Ran Exercise 4.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py index 495cd7c53069dbf57e58136bd1f904d8f1403d05..5667029a6b07a0012bdb4439c5ca978a80e9dcbb 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex4_3_2.py @@ -1,21 +1,34 @@ # exercise 4.3.2 -from matplotlib.pyplot import figure, subplot, plot, legend, show, xlabel, ylabel, xticks, yticks +import importlib_resources import numpy as np +from matplotlib.pyplot import ( + figure, + legend, + plot, + show, + subplot, + xlabel, + xticks, + ylabel, + yticks, +) from scipy.io import loadmat from scipy.stats import zscore +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine.mat') -X = mat_data['X'] -y = np.squeeze(mat_data['y']) -C = mat_data['C'][0,0] -M = mat_data['M'][0,0] -N = mat_data['N'][0,0] - -attributeNames = [name[0][0] for name in mat_data['attributeNames']] -classNames = [cls[0] for cls in mat_data['classNames'][0]] - +mat_data = loadmat(filename) +X = mat_data["X"] +y = np.squeeze(mat_data["y"]) +C = mat_data["C"][0, 0] +M = mat_data["M"][0, 0] +N = mat_data["N"][0, 0] + +attributeNames = [name[0][0] for name in mat_data["attributeNames"]] +classNames = [cls[0] for cls in mat_data["classNames"][0]] + # The histograms show that there are a few very extreme values in these # three attributes. To identify these values as outliers, we must use our # knowledge about the data set and the attributes. Say we expect volatide @@ -23,37 +36,37 @@ classNames = [cls[0] for cls in mat_data['classNames'][0]] # alcohol percentage to be somewhere between 5-20 % vol. Then we can safely # identify the following outliers, which are a factor of 10 greater than # the largest we expect. 
-outlier_mask = (X[:,1]>20) | (X[:,7]>10) | (X[:,10]>200) +outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200) valid_mask = np.logical_not(outlier_mask) # Finally we will remove these from the data set -X = X[valid_mask,:] +X = X[valid_mask, :] y = y[valid_mask] N = len(y) Xnorm = zscore(X, ddof=1) ## Next we plot a number of atttributes -Attributes = [1,4,5,6] +Attributes = [1, 4, 5, 6] NumAtr = len(Attributes) -figure(figsize=(12,12)) +figure(figsize=(12, 12)) for m1 in range(NumAtr): for m2 in range(NumAtr): - subplot(NumAtr, NumAtr, m1*NumAtr + m2 + 1) + subplot(NumAtr, NumAtr, m1 * NumAtr + m2 + 1) for c in range(C): - class_mask = (y==c) - plot(X[class_mask,Attributes[m2]], X[class_mask,Attributes[m1]], '.') - if m1==NumAtr-1: + class_mask = y == c + plot(X[class_mask, Attributes[m2]], X[class_mask, Attributes[m1]], ".") + if m1 == NumAtr - 1: xlabel(attributeNames[Attributes[m2]]) else: xticks([]) - if m2==0: + if m2 == 0: ylabel(attributeNames[Attributes[m1]]) else: yticks([]) - #ylim(0,X.max()*1.1) - #xlim(0,X.max()*1.1) + # ylim(0,X.max()*1.1) + # xlim(0,X.max()*1.1) legend(classNames) show() -print('Ran Exercise 4.3.2') \ No newline at end of file +print("Ran Exercise 4.3.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_1.py index ef120313b2a4750ccb46e315364d941f11323457..3b3c5e47645345a0e7c34b6ad2bacc390b05ac91 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_1.py @@ -4,36 +4,38 @@ import numpy as np # Names of data objects dataobjectNames = [ - 'Human', - 'Python', - 'Salmon', - 'Whale', - 'Frog', - 'Komodo dragon', - 'Bat', - 'Pigeon', - 'Cat', - 'Leopard shark', - 'Turtle', - 'Penguin', - 'Porcupine', - 'Eel', - 'Salamander', - ] + "Human", + "Python", + "Salmon", + "Whale", + "Frog", + "Komodo dragon", + "Bat", + "Pigeon", + "Cat", + "Leopard shark", + "Turtle", + "Penguin", + "Porcupine", + "Eel", + "Salamander", +] # Attribute names attributeNames = [ - 'Body temperature', - 'Skin cover', - 'Gives birth', - 'Aquatic creature', - 'Aerial creature', - 'Has legs', - 'Hibernates' - ] + "Body temperature", + "Skin cover", + "Gives birth", + "Aquatic creature", + "Aerial creature", + "Has legs", + "Hibernates", +] # Attribute values -X = np.asarray(np.mat(''' +X = np.asarray( + np.mat( + """ 1 1 1 0 0 1 0; 0 2 0 0 0 0 1; 0 2 0 1 0 0 0; @@ -48,16 +50,18 @@ X = np.asarray(np.mat(''' 1 3 0 2 0 1 0; 1 5 1 0 0 1 1; 0 2 0 1 0 0 0; - 0 0 0 2 0 1 1 ''')) + 0 0 0 2 0 1 1 """ + ) +) # Class indices -y = np.asarray(np.mat('3 4 2 3 0 4 3 1 3 2 4 1 3 2 0').T).squeeze() +y = np.asarray(np.mat("3 4 2 3 0 4 3 1 3 2 4 1 3 2 0").T).squeeze() # Class names -classNames = ['Amphibian', 'Bird', 'Fish', 'Mammal', 'Reptile'] - +classNames = ["Amphibian", "Bird", "Fish", "Mammal", "Reptile"] + # Number data objects, attributes, and classes N, M = X.shape C = len(classNames) -print('Ran Exercise 5.1.1') \ No newline at end of file +print("Ran Exercise 5.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_2.py index d4f7fbdbe6673b559b98246caba98c892bf7edb0..dc3bcc42b52a6e36dc6a86fb23bbcb338899f270 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_2.py @@ -1,27 +1,28 @@ # exercise 5.1.2 -import numpy as np -from sklearn import tree -from platform import system from os import getcwd +from platform import system + import matplotlib.pyplot as plt 
-from matplotlib.image import imread +import numpy as np # requires data from exercise 5.1.1 from ex5_1_1 import * +from matplotlib.image import imread +from sklearn import tree # Fit regression tree classifier, Gini split criterion, no pruning -criterion = 'gini' +criterion = "gini" dtc = tree.DecisionTreeClassifier(criterion=criterion, min_samples_split=2) dtc = dtc.fit(X, y) # Visualize the graph (you can also inspect the generated image file in an external program) # NOTE: depending on your setup you may need to decrease or increase the figsize and DPI setting # to get a readable plot. Hint: Try to maximize the figure after it displays. -fname='tree_ex512_' + criterion + '.png' +fname = "tree_ex512_" + criterion + ".png" -fig = plt.figure(figsize=(4,4),dpi=100) -_ = tree.plot_tree(dtc, filled=False,feature_names=attributeNames) +fig = plt.figure(figsize=(4, 4), dpi=100) +_ = tree.plot_tree(dtc, filled=False, feature_names=attributeNames) plt.savefig(fname) plt.show() -print('Ran Exercise 5.1.2') \ No newline at end of file +print("Ran Exercise 5.1.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_3.py index bd3700d2a01532e48e28ea7f4c8556298ba5bde3..afd780e7b5a8c32cb2d61c9522010896e8361128 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_3.py @@ -1,33 +1,35 @@ # exercise 5.1.3 import os -import numpy as np -from sklearn import tree -from platform import system from os import getcwd +from platform import system + import matplotlib.pyplot as plt -from matplotlib.image import imread -#import graphviz -#import pydotplus +import numpy as np # requires data from exercise 5.1.1 from ex5_1_1 import * +from matplotlib.image import imread +from sklearn import tree + +# import graphviz +# import pydotplus + # Fit regression tree classifier, Gini split criterion, no pruning -criterion='gini' +criterion = "gini" # dtc = tree.DecisionTreeClassifier(criterion=criterion, min_samples_split=2) -dtc = tree.DecisionTreeClassifier(criterion=criterion, min_samples_split=1.0/N) -dtc = dtc.fit(X,y) +dtc = tree.DecisionTreeClassifier(criterion=criterion, min_samples_split=1.0 / N) +dtc = dtc.fit(X, y) # convert the tree into a png file using the Graphviz toolset -fname='tree_ex513_' + criterion + '.png' +fname = "tree_ex513_" + criterion + ".png" # Visualize the graph (you can also inspect the generated image file in an external program) # NOTE: depending on your setup you may need to decrease or increase the figsize and DPI setting # to get a useful plot. Hint: Try to maximize the figure after it displays. 
-fig = plt.figure(figsize=(4,4),dpi=100) -_ = tree.plot_tree(dtc, filled=False,feature_names=attributeNames) +fig = plt.figure(figsize=(4, 4), dpi=100) +_ = tree.plot_tree(dtc, filled=False, feature_names=attributeNames) plt.savefig(fname) plt.show() -print('Ran Exercise 5.1.3') - +print("Ran Exercise 5.1.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_4.py index a4424aa7158da8eefd25e4cb45674afc79dfb95d..74595d04f2e4a6408a21baeda42af92d0d1600da 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_4.py @@ -3,15 +3,15 @@ from ex5_1_2 import * # Define a new data object (a dragon) with the attributes given in the text -x = np.array([0, 2, 1, 2, 1, 1, 1]).reshape(1,-1) +x = np.array([0, 2, 1, 2, 1, 1, 1]).reshape(1, -1) # Evaluate the classification tree for the new data object x_class = dtc.predict(x)[0] # Print results -print('\nNew object attributes:') -print(dict(zip(attributeNames,x[0]))) -print('\nClassification result:') +print("\nNew object attributes:") +print(dict(zip(attributeNames, x[0]))) +print("\nClassification result:") print(classNames[x_class]) -print('Ran Exercise 5.1.4') \ No newline at end of file +print("Ran Exercise 5.1.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_5.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_5.py index fa75dd5b94175176cc7192581913aea31a6bd21a..9ce89d96a6422ddf4ff3813baf4775c3fc72411f 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_5.py @@ -1,32 +1,35 @@ # exercise 5.1.5 -import numpy as np import os + +import importlib_resources +import numpy as np from scipy.io import loadmat +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine.mat") # Load Matlab data file and extract variables of interest workingDir = os.getcwd() print("Running from: " + workingDir) -mat_data = loadmat('../Data/wine.mat') -X = mat_data['X'] -y = mat_data['y'].astype(int).squeeze() -C = mat_data['C'][0,0] -M = mat_data['M'][0,0] -N = mat_data['N'][0,0] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].astype(int).squeeze() +C = mat_data["C"][0, 0] +M = mat_data["M"][0, 0] +N = mat_data["N"][0, 0] -attributeNames = [i[0][0] for i in mat_data['attributeNames']] -classNames = [j[0] for i in mat_data['classNames'] for j in i] +attributeNames = [i[0][0] for i in mat_data["attributeNames"]] +classNames = [j[0] for i in mat_data["classNames"] for j in i] # Remove outliers -outlier_mask = (X[:,1]>20) | (X[:,7]>10) | (X[:,10]>200) +outlier_mask = (X[:, 1] > 20) | (X[:, 7] > 10) | (X[:, 10] > 200) valid_mask = np.logical_not(outlier_mask) -X = X[valid_mask,:] +X = X[valid_mask, :] y = y[valid_mask] # Remove attribute 12 (Quality score) -X = X[:,0:11] +X = X[:, 0:11] attributeNames = attributeNames[0:11] # Update N and M N, M = X.shape -print('Ran Exercise 5.1.5') \ No newline at end of file +print("Ran Exercise 5.1.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_6.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_6.py index f25d9d07a14c2eb6fb417dcb3b7195b455615d6e..6e5902032b4c4616caec4e613fa156b7f150243a 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_6.py @@ -1,35 +1,35 @@ # exercise 5.1.6 -import numpy as np -from sklearn import tree -from platform import system from os import getcwd +from platform import system + import matplotlib.pyplot as plt -from matplotlib.image 
import imread +import numpy as np # requires data from exercise 5.1.5 from ex5_1_5 import * +from matplotlib.image import imread +from sklearn import tree # Fit classification tree using, Gini split criterion, no pruning -criterion='gini' +criterion = "gini" dtc = tree.DecisionTreeClassifier(criterion=criterion, min_samples_split=100) -dtc = dtc.fit(X,y) +dtc = dtc.fit(X, y) # Visualize the graph (you can also inspect the generated image file in an external program) -# NOTE: depending on your screen resolution and setup you may need to decrease or increase -# the figsize and DPI setting to get a useful plot. -# Hint: Try to open the generated png file in an external image editor as it can be easier +# NOTE: depending on your screen resolution and setup you may need to decrease or increase +# the figsize and DPI setting to get a useful plot. +# Hint: Try to open the generated png file in an external image editor as it can be easier # to inspect outside matplotlib's figure environment. -fname='tree_ex516_' + criterion + '_wine_data.png' -fig = plt.figure(figsize=(12,12),dpi=300) -_ = tree.plot_tree(dtc, filled=False,feature_names=attributeNames) +fname = "tree_ex516_" + criterion + "_wine_data.png" +fig = plt.figure(figsize=(12, 12), dpi=300) +_ = tree.plot_tree(dtc, filled=False, feature_names=attributeNames) plt.savefig(fname) -plt.close() +plt.close() fig = plt.figure() plt.imshow(imread(fname)) -plt.axis('off') -plt.box('off') +plt.axis("off") +plt.box("off") plt.show() -print('Ran Exercise 5.1.6') - +print("Ran Exercise 5.1.6") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_1_7.py b/exercises/02450Toolbox_Python/Scripts/ex5_1_7.py index df69b8ab71233bf78471e829650763ea40f31f7f..b194e17fb511593cec7e4010da8ca1cad9d1bd6e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_1_7.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_1_7.py @@ -3,16 +3,16 @@ from ex5_1_6 import * # Define a new data object (new type of wine) with the attributes given in the text -x = np.array([6.9, 1.09, .06, 2.1, .0061, 12, 31, .99, 3.5, .44, 12]).reshape(1,-1) +x = np.array([6.9, 1.09, 0.06, 2.1, 0.0061, 12, 31, 0.99, 3.5, 0.44, 12]).reshape(1, -1) # Evaluate the classification tree for the new data object x_class = dtc.predict(x)[0] # Print results -print('\nNew object attributes:') +print("\nNew object attributes:") for i in range(len(attributeNames)): - print('{0}: {1}'.format(attributeNames[i],x[0][i])) -print('\nClassification result:') + print("{0}: {1}".format(attributeNames[i], x[0][i])) +print("\nClassification result:") print(classNames[x_class]) -print('Ran Exercise 5.1.7') \ No newline at end of file +print("Ran Exercise 5.1.7") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex5_2_1.py index 92968f336927449d473582abaf25ca8c1dc511b4..f78c82ce7a010322a1005d0650fb1c574e60338c 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_2_1.py @@ -1,7 +1,7 @@ # exercise 5.2.1 -from matplotlib.pyplot import figure, plot, xlabel, ylabel, title, show import numpy as np +from matplotlib.pyplot import figure, plot, show, title, xlabel, ylabel # Number of data objects N = 100 @@ -11,21 +11,22 @@ X = np.array(range(N)) # Noise eps_mean, eps_std = 0, 0.1 -eps = np.array(eps_std*np.random.randn(N) + eps_mean) +eps = np.array(eps_std * np.random.randn(N) + eps_mean) # Model parameters w0 = -0.5 w1 = 0.01 # Outputs -y = w0 + w1*X + eps +y = w0 + w1 * X + eps # Make a scatter plot figure() 
-plot(X,y,'o') -xlabel('X'); ylabel('y') -title('Illustration of a linear relation with noise') +plot(X, y, "o") +xlabel("X") +ylabel("y") +title("Illustration of a linear relation with noise") show() -print('Ran Exercise 5.2.1') \ No newline at end of file +print("Ran Exercise 5.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex5_2_2.py index fc557f814b337c27c1f1ddfc9ee79ee78c9d3676..686fcc14c646a6a764228ec0aeb664dbe031d2ab 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_2_2.py @@ -1,37 +1,38 @@ # exercise 5.2.2 -from matplotlib.pyplot import figure, plot, xlabel, ylabel, legend, show -import sklearn.linear_model as lm import numpy as np +import sklearn.linear_model as lm +from matplotlib.pyplot import figure, legend, plot, show, xlabel, ylabel # Use dataset as in the previous exercise N = 100 -X = np.array(range(N)).reshape(-1,1) +X = np.array(range(N)).reshape(-1, 1) eps_mean, eps_std = 0, 0.1 -eps = np.array(eps_std*np.random.randn(N) + eps_mean).reshape(-1,1) +eps = np.array(eps_std * np.random.randn(N) + eps_mean).reshape(-1, 1) w0 = -0.5 w1 = 0.01 -y = w0 + w1*X + eps +y = w0 + w1 * X + eps y_true = y - eps # Fit ordinary least squares regression model model = lm.LinearRegression(fit_intercept=True) -model = model.fit(X,y) +model = model.fit(X, y) # Compute model output: y_est = model.predict(X) # Or equivalently: -#y_est = model.intercept_ + X @ model.coef_ +# y_est = model.intercept_ + X @ model.coef_ # Plot original data and the model output f = figure() -plot(X,y,'.') -plot(X,y_true,'-') -plot(X,y_est,'-') -xlabel('X'); ylabel('y') -legend(['Training data', 'Data generator', 'Regression fit (model)']) +plot(X, y, ".") +plot(X, y_true, "-") +plot(X, y_est, "-") +xlabel("X") +ylabel("y") +legend(["Training data", "Data generator", "Regression fit (model)"]) show() -print('Ran Exercise 5.2.2') \ No newline at end of file +print("Ran Exercise 5.2.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_2_3.py b/exercises/02450Toolbox_Python/Scripts/ex5_2_3.py index 86e82a29af80179cbf4e5ec4a9b0c8f76bcf8c4c..172a1ced9414d5466ecf046135fefee707d6e726 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_2_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_2_3.py @@ -1,47 +1,56 @@ # exercise 5.2.3 -from matplotlib.pyplot import figure, plot, xlabel, ylabel, legend, show, ylim import numpy as np import sklearn.linear_model as lm +from matplotlib.pyplot import figure, legend, plot, show, xlabel, ylabel, ylim # Parameters Kd = 5 # no of terms for data generator Km = 3 # no of terms for regression model N = 50 # no of data objects to train a model -Xe = np.linspace(-2,2,1000).reshape(-1,1) # X values to visualize true data and model -eps_mean, eps_std = 0, 0.5 # noise parameters +Xe = np.linspace(-2, 2, 1000).reshape( + -1, 1 +) # X values to visualize true data and model +eps_mean, eps_std = 0, 0.5 # noise parameters # Generate dataset (with noise) -X = np.linspace(-2,2,N).reshape(-1,1) -Xd = np.power(X, range(1,Kd+1)) -eps = (eps_std*np.random.randn(N) + eps_mean) -w = -np.power(-.9, range(1,Kd+2)) -y = w[0] + Xd @ w[1:] + eps +X = np.linspace(-2, 2, N).reshape(-1, 1) +Xd = np.power(X, range(1, Kd + 1)) +eps = eps_std * np.random.randn(N) + eps_mean +w = -np.power(-0.9, range(1, Kd + 2)) +y = w[0] + Xd @ w[1:] + eps # True data generator (assuming no noise) -Xde = np.power(Xe, range(1,Kd+1)) +Xde = np.power(Xe, range(1, Kd + 1)) y_true = w[0] + Xde @ w[1:] - # Fit 
ordinary least squares regression model -Xm = np.power(X, range(1,Km+1)) +Xm = np.power(X, range(1, Km + 1)) model = lm.LinearRegression() -model = model.fit(Xm,y) +model = model.fit(Xm, y) # Predict values -Xme = np.power(Xe, range(1,Km+1)) +Xme = np.power(Xe, range(1, Km + 1)) y_est = model.predict(Xme) # Plot original data and the model output f = figure() -plot(X,y,'.') -plot(Xe,y_true,'-') -plot(Xe,y_est,'-') -xlabel('X'); ylabel('y'); ylim(-2,8) -legend(['Training data', 'Data generator K={0}'.format(Kd), 'Regression fit (model) K={0}'.format(Km)]) +plot(X, y, ".") +plot(Xe, y_true, "-") +plot(Xe, y_est, "-") +xlabel("X") +ylabel("y") +ylim(-2, 8) +legend( + [ + "Training data", + "Data generator K={0}".format(Kd), + "Regression fit (model) K={0}".format(Km), + ] +) show() -print('Ran Exercise 5.2.3') \ No newline at end of file +print("Ran Exercise 5.2.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_2_4.py b/exercises/02450Toolbox_Python/Scripts/ex5_2_4.py index 6d4243e7144fb03d52dd3edbd65809bc877b0ae2..0790af6dbc80fc6c366a257ea8d2c4c0eddc6e9c 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_2_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_2_4.py @@ -1,33 +1,34 @@ # exercise 5.2.4 -from matplotlib.pylab import figure, subplot, plot, xlabel, ylabel, hist, show import sklearn.linear_model as lm # requires wine data from exercise 5.1.5 from ex5_1_5 import * +from matplotlib.pylab import figure, hist, plot, show, subplot, xlabel, ylabel # Split dataset into features and target vector -alcohol_idx = attributeNames.index('Alcohol') -y = X[:,alcohol_idx] +alcohol_idx = attributeNames.index("Alcohol") +y = X[:, alcohol_idx] -X_cols = list(range(0,alcohol_idx)) + list(range(alcohol_idx+1,len(attributeNames))) -X = X[:,X_cols] +X_cols = list(range(0, alcohol_idx)) + list(range(alcohol_idx + 1, len(attributeNames))) +X = X[:, X_cols] # Fit ordinary least squares regression model model = lm.LinearRegression() -model.fit(X,y) +model.fit(X, y) # Predict alcohol content y_est = model.predict(X) -residual = y_est-y +residual = y_est - y # Display scatter plot figure() -subplot(2,1,1) -plot(y, y_est, '.') -xlabel('Alcohol content (true)'); ylabel('Alcohol content (estimated)'); -subplot(2,1,2) -hist(residual,40) +subplot(2, 1, 1) +plot(y, y_est, ".") +xlabel("Alcohol content (true)") +ylabel("Alcohol content (estimated)") +subplot(2, 1, 2) +hist(residual, 40) show() -print('Ran Exercise 5.2.4') \ No newline at end of file +print("Ran Exercise 5.2.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_2_5.py b/exercises/02450Toolbox_Python/Scripts/ex5_2_5.py index e877d321e5e132801774d9531b6cf7bf199a45b7..8e44343b6b81236db68eb2642651834cbeaaca69 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_2_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_2_5.py @@ -1,56 +1,59 @@ # exercise 5.2.5 -from matplotlib.pylab import figure, plot, subplot, xlabel, ylabel, hist, show import sklearn.linear_model as lm # requires data from exercise 5.1.4 from ex5_1_5 import * - +from matplotlib.pylab import figure, hist, plot, show, subplot, xlabel, ylabel # Split dataset into features and target vector -alcohol_idx = attributeNames.index('Alcohol') -y = X[:,alcohol_idx] +alcohol_idx = attributeNames.index("Alcohol") +y = X[:, alcohol_idx] -X_cols = list(range(0,alcohol_idx)) + list(range(alcohol_idx+1,len(attributeNames))) -X = X[:,X_cols] +X_cols = list(range(0, alcohol_idx)) + list(range(alcohol_idx + 1, len(attributeNames))) +X = X[:, X_cols] # Additional nonlinear attributes 
-fa_idx = attributeNames.index('Fixed acidity') -va_idx = attributeNames.index('Volatile acidity') -Xfa2 = np.power(X[:,fa_idx],2).reshape(-1,1) -Xva2 = np.power(X[:,va_idx],2).reshape(-1,1) -Xfava = (X[:,fa_idx]*X[:,va_idx]).reshape(-1,1) -X = np.asarray(np.bmat('X, Xfa2, Xva2, Xfava')) +fa_idx = attributeNames.index("Fixed acidity") +va_idx = attributeNames.index("Volatile acidity") +Xfa2 = np.power(X[:, fa_idx], 2).reshape(-1, 1) +Xva2 = np.power(X[:, va_idx], 2).reshape(-1, 1) +Xfava = (X[:, fa_idx] * X[:, va_idx]).reshape(-1, 1) +X = np.asarray(np.bmat("X, Xfa2, Xva2, Xfava")) # Fit ordinary least squares regression model model = lm.LinearRegression() -model.fit(X,y) +model.fit(X, y) # Predict alcohol content y_est = model.predict(X) -residual = y_est-y +residual = y_est - y # Display plots -figure(figsize=(12,8)) +figure(figsize=(12, 8)) -subplot(2,1,1) -plot(y, y_est, '.g') -xlabel('Alcohol content (true)'); ylabel('Alcohol content (estimated)') +subplot(2, 1, 1) +plot(y, y_est, ".g") +xlabel("Alcohol content (true)") +ylabel("Alcohol content (estimated)") -subplot(4,1,3) -hist(residual,40) +subplot(4, 1, 3) +hist(residual, 40) -subplot(4,3,10) -plot(Xfa2, residual, '.r') -xlabel('Fixed Acidity ^2'); ylabel('Residual') +subplot(4, 3, 10) +plot(Xfa2, residual, ".r") +xlabel("Fixed Acidity ^2") +ylabel("Residual") -subplot(4,3,11) -plot(Xva2, residual, '.r') -xlabel('Volatile Acidity ^2'); ylabel('Residual') +subplot(4, 3, 11) +plot(Xva2, residual, ".r") +xlabel("Volatile Acidity ^2") +ylabel("Residual") -subplot(4,3,12) -plot(Xfava, residual, '.r') -xlabel('Fixed*Volatile Acidity'); ylabel('Residual') +subplot(4, 3, 12) +plot(Xfava, residual, ".r") +xlabel("Fixed*Volatile Acidity") +ylabel("Residual") show() -print('Ran Exercise 5.2.5') \ No newline at end of file +print("Ran Exercise 5.2.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex5_2_6.py b/exercises/02450Toolbox_Python/Scripts/ex5_2_6.py index ae4c5560eff6b241bf345ac759517da181c34ae9..bc1af0024eba48832993e2004ba8c44dd7884a27 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex5_2_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex5_2_6.py @@ -1,40 +1,41 @@ # exercise 5.2.6 -from matplotlib.pylab import figure, plot, xlabel, ylabel, legend, ylim, show import sklearn.linear_model as lm # requires data from exercise 5.1.4 from ex5_1_5 import * +from matplotlib.pylab import figure, legend, plot, show, xlabel, ylabel, ylim # Fit logistic regression model model = lm.LogisticRegression() -model = model.fit(X,y) +model = model.fit(X, y) # Classify wine as White/Red (0/1) and assess probabilities y_est = model.predict(X) -y_est_white_prob = model.predict_proba(X)[:, 0] +y_est_white_prob = model.predict_proba(X)[:, 0] # Define a new data object (new type of wine), as in exercise 5.1.7 -x = np.array([6.9, 1.09, .06, 2.1, .0061, 12, 31, .99, 3.5, .44, 12]).reshape(1,-1) -# Evaluate the probability of x being a white wine (class=0) -x_class = model.predict_proba(x)[0,0] +x = np.array([6.9, 1.09, 0.06, 2.1, 0.0061, 12, 31, 0.99, 3.5, 0.44, 12]).reshape(1, -1) +# Evaluate the probability of x being a white wine (class=0) +x_class = model.predict_proba(x)[0, 0] # Evaluate classifier's misclassification rate over entire training data misclass_rate = np.sum(y_est != y) / float(len(y_est)) # Display classification results -print('\nProbability of given sample being a white wine: {0:.4f}'.format(x_class)) -print('\nOverall misclassification rate: {0:.3f}'.format(misclass_rate)) - -f = figure(); -class0_ids = np.nonzero(y==0)[0].tolist() 
-plot(class0_ids, y_est_white_prob[class0_ids], '.y') -class1_ids = np.nonzero(y==1)[0].tolist() -plot(class1_ids, y_est_white_prob[class1_ids], '.r') -xlabel('Data object (wine sample)'); ylabel('Predicted prob. of class White'); -legend(['White', 'Red']) -ylim(-0.01,1.5) +print("\nProbability of given sample being a white wine: {0:.4f}".format(x_class)) +print("\nOverall misclassification rate: {0:.3f}".format(misclass_rate)) + +f = figure() +class0_ids = np.nonzero(y == 0)[0].tolist() +plot(class0_ids, y_est_white_prob[class0_ids], ".y") +class1_ids = np.nonzero(y == 1)[0].tolist() +plot(class1_ids, y_est_white_prob[class1_ids], ".r") +xlabel("Data object (wine sample)") +ylabel("Predicted prob. of class White") +legend(["White", "Red"]) +ylim(-0.01, 1.5) show() -print('Ran Exercise 5.2.6') \ No newline at end of file +print("Ran Exercise 5.2.6") diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex6_1_1.py index 8915ac6a25802e073a83c8a5f537fdd00a1adaf1..5103b044bbea6b0879452bdb3e41f0b6cf38a101 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_1_1.py @@ -1,16 +1,19 @@ # exercise 6.1.1 -from matplotlib.pylab import figure, plot, xlabel, ylabel, legend, show +import importlib_resources +import numpy as np +from matplotlib.pylab import figure, legend, plot, show, xlabel, ylabel from scipy.io import loadmat from sklearn import model_selection, tree -import numpy as np + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine2.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) @@ -19,31 +22,33 @@ tc = np.arange(2, 21, 1) # Simple holdout-set crossvalidation test_proportion = 0.5 -X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=test_proportion) +X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y, test_size=test_proportion +) # Initialize variables -Error_train = np.empty((len(tc),1)) -Error_test = np.empty((len(tc),1)) +Error_train = np.empty((len(tc), 1)) +Error_test = np.empty((len(tc), 1)) for i, t in enumerate(tc): # Fit decision tree classifier, Gini split criterion, different pruning levels - dtc = tree.DecisionTreeClassifier(criterion='gini', max_depth=t) - dtc = dtc.fit(X_train,y_train) + dtc = tree.DecisionTreeClassifier(criterion="gini", max_depth=t) + dtc = dtc.fit(X_train, y_train) # Evaluate classifier's misclassification rate over train/test data - y_est_test = np.asarray(dtc.predict(X_test),dtype=int) + y_est_test = np.asarray(dtc.predict(X_test), dtype=int) y_est_train = np.asarray(dtc.predict(X_train), dtype=int) misclass_rate_test = sum(y_est_test != y_test) / float(len(y_est_test)) misclass_rate_train = sum(y_est_train != y_train) / float(len(y_est_train)) Error_test[i], Error_train[i] = misclass_rate_test, misclass_rate_train - + f = figure() -plot(tc, Error_train*100) -plot(tc, Error_test*100) -xlabel('Model complexity (max tree depth)') -ylabel('Error (%)') -legend(['Error_train','Error_test']) - -show() - 
-print('Ran Exercise 6.1.1') \ No newline at end of file +plot(tc, Error_train * 100) +plot(tc, Error_test * 100) +xlabel("Model complexity (max tree depth)") +ylabel("Error (%)") +legend(["Error_train", "Error_test"]) + +show() + +print("Ran Exercise 6.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex6_1_2.py index 68ae488ab49ccceb0c86a4c85724d0b6401c99cb..f5250f775e6d12d621a14dd0150913c63cc4d11a 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_1_2.py @@ -1,16 +1,18 @@ # exercise 6.1.2 -from matplotlib.pyplot import figure, plot, xlabel, ylabel, legend, show, boxplot +import importlib_resources +import numpy as np +from matplotlib.pyplot import boxplot, figure, legend, plot, show, xlabel, ylabel from scipy.io import loadmat from sklearn import model_selection, tree -import numpy as np +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine2.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) @@ -19,45 +21,45 @@ tc = np.arange(2, 21, 1) # K-fold crossvalidation K = 10 -CV = model_selection.KFold(n_splits=K,shuffle=True) +CV = model_selection.KFold(n_splits=K, shuffle=True) # Initialize variable -Error_train = np.empty((len(tc),K)) -Error_test = np.empty((len(tc),K)) +Error_train = np.empty((len(tc), K)) +Error_test = np.empty((len(tc), K)) -k=0 +k = 0 for train_index, test_index in CV.split(X): - print('Computing CV fold: {0}/{1}..'.format(k+1,K)) + print("Computing CV fold: {0}/{1}..".format(k + 1, K)) # extract training and test set for current CV fold - X_train, y_train = X[train_index,:], y[train_index] - X_test, y_test = X[test_index,:], y[test_index] + X_train, y_train = X[train_index, :], y[train_index] + X_test, y_test = X[test_index, :], y[test_index] for i, t in enumerate(tc): # Fit decision tree classifier, Gini split criterion, different pruning levels - dtc = tree.DecisionTreeClassifier(criterion='gini', max_depth=t) - dtc = dtc.fit(X_train,y_train.ravel()) + dtc = tree.DecisionTreeClassifier(criterion="gini", max_depth=t) + dtc = dtc.fit(X_train, y_train.ravel()) y_est_test = dtc.predict(X_test) y_est_train = dtc.predict(X_train) # Evaluate misclassification rate over train/test data (in this CV fold) misclass_rate_test = np.sum(y_est_test != y_test) / float(len(y_est_test)) misclass_rate_train = np.sum(y_est_train != y_train) / float(len(y_est_train)) - Error_test[i,k], Error_train[i,k] = misclass_rate_test, misclass_rate_train - k+=1 + Error_test[i, k], Error_train[i, k] = misclass_rate_test, misclass_rate_train + k += 1 + - f = figure() boxplot(Error_test.T) -xlabel('Model complexity (max tree depth)') -ylabel('Test error across CV folds, K={0})'.format(K)) +xlabel("Model complexity (max tree depth)") +ylabel("Test error across CV folds, K={0})".format(K)) f = figure() plot(tc, Error_train.mean(1)) plot(tc, Error_test.mean(1)) -xlabel('Model complexity (max tree depth)') -ylabel('Error (misclassification rate, CV K={0})'.format(K)) 
-legend(['Error_train','Error_test']) - +xlabel("Model complexity (max tree depth)") +ylabel("Error (misclassification rate, CV K={0})".format(K)) +legend(["Error_train", "Error_test"]) + show() -print('Ran Exercise 6.1.2') \ No newline at end of file +print("Ran Exercise 6.1.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py index 680cc65b6a4b8bb52f1b443a05d2b80f14045fdd..2be950077126609cf895cae0dbacbc49a8be79a0 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_2_1.py @@ -1,128 +1,171 @@ # exercise 6.2.1 -from matplotlib.pyplot import figure, plot, subplot, title, xlabel, ylabel, show, clim -from scipy.io import loadmat +import importlib_resources +import numpy as np import sklearn.linear_model as lm +from matplotlib.pyplot import clim, figure, plot, show, subplot, title, xlabel, ylabel +from scipy.io import loadmat from sklearn import model_selection -from toolbox_02450 import feature_selector_lr, bmplot -import numpy as np +from dtuimldmtools import bmplot, feature_selector_lr + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/body.mat") # Load data from matlab file -mat_data = loadmat('../Data/body.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] N, M = X.shape ## Crossvalidation # Create crossvalidation partition for evaluation K = 5 -CV = model_selection.KFold(n_splits=K,shuffle=True) +CV = model_selection.KFold(n_splits=K, shuffle=True) # Initialize variables -Features = np.zeros((M,K)) -Error_train = np.empty((K,1)) -Error_test = np.empty((K,1)) -Error_train_fs = np.empty((K,1)) -Error_test_fs = np.empty((K,1)) -Error_train_nofeatures = np.empty((K,1)) -Error_test_nofeatures = np.empty((K,1)) - -k=0 +Features = np.zeros((M, K)) +Error_train = np.empty((K, 1)) +Error_test = np.empty((K, 1)) +Error_train_fs = np.empty((K, 1)) +Error_test_fs = np.empty((K, 1)) +Error_train_nofeatures = np.empty((K, 1)) +Error_test_nofeatures = np.empty((K, 1)) + +k = 0 for train_index, test_index in CV.split(X): - # extract training and test set for current CV fold - X_train = X[train_index,:] + X_train = X[train_index, :] y_train = y[train_index] - X_test = X[test_index,:] + X_test = X[test_index, :] y_test = y[test_index] internal_cross_validation = 10 - + # Compute squared error without using the input data at all - Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum()/y_train.shape[0] - Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum()/y_test.shape[0] + Error_train_nofeatures[k] = ( + np.square(y_train - y_train.mean()).sum() / y_train.shape[0] + ) + Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0] # Compute squared error with all features selected (no feature selection) m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train) - Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0] - Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0] + Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0] + Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0] # Compute squared error with feature subset selection - textout = '' - selected_features, features_record, loss_record = 
feature_selector_lr(X_train, y_train, internal_cross_validation,display=textout) - - Features[selected_features,k] = 1 + textout = "" + selected_features, features_record, loss_record = feature_selector_lr( + X_train, y_train, internal_cross_validation, display=textout + ) + + Features[selected_features, k] = 1 # .. alternatively you could use module sklearn.feature_selection if len(selected_features) == 0: - print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).' ) + print( + "No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y)." + ) else: - m = lm.LinearRegression(fit_intercept=True).fit(X_train[:,selected_features], y_train) - Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0] - Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0] - + m = lm.LinearRegression(fit_intercept=True).fit( + X_train[:, selected_features], y_train + ) + Error_train_fs[k] = ( + np.square(y_train - m.predict(X_train[:, selected_features])).sum() + / y_train.shape[0] + ) + Error_test_fs[k] = ( + np.square(y_test - m.predict(X_test[:, selected_features])).sum() + / y_test.shape[0] + ) + figure(k) - subplot(1,2,1) - plot(range(1,len(loss_record)), loss_record[1:]) - xlabel('Iteration') - ylabel('Squared error (crossvalidation)') - - subplot(1,3,3) - bmplot(attributeNames, range(1,features_record.shape[1]), -features_record[:,1:]) - clim(-1.5,0) - xlabel('Iteration') + subplot(1, 2, 1) + plot(range(1, len(loss_record)), loss_record[1:]) + xlabel("Iteration") + ylabel("Squared error (crossvalidation)") + + subplot(1, 3, 3) + bmplot( + attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:] + ) + clim(-1.5, 0) + xlabel("Iteration") - print('Cross validation fold {0}/{1}'.format(k+1,K)) - print('Train indices: {0}'.format(train_index)) - print('Test indices: {0}'.format(test_index)) - print('Features no: {0}\n'.format(selected_features.size)) + print("Cross validation fold {0}/{1}".format(k + 1, K)) + print("Train indices: {0}".format(train_index)) + print("Test indices: {0}".format(test_index)) + print("Features no: {0}\n".format(selected_features.size)) - k+=1 + k += 1 # Display results -print('\n') -print('Linear regression without feature selection:\n') -print('- Training error: {0}'.format(Error_train.mean())) -print('- Test error: {0}'.format(Error_test.mean())) -print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum())) -print('- R^2 test: {0}'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum())) -print('Linear regression with feature selection:\n') -print('- Training error: {0}'.format(Error_train_fs.mean())) -print('- Test error: {0}'.format(Error_test_fs.mean())) -print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train_fs.sum())/Error_train_nofeatures.sum())) -print('- R^2 test: {0}'.format((Error_test_nofeatures.sum()-Error_test_fs.sum())/Error_test_nofeatures.sum())) +print("\n") +print("Linear regression without feature selection:\n") +print("- Training error: {0}".format(Error_train.mean())) +print("- Test error: {0}".format(Error_test.mean())) +print( + "- R^2 train: {0}".format( + (Error_train_nofeatures.sum() - Error_train.sum()) + / Error_train_nofeatures.sum() + ) +) +print( + "- R^2 test: {0}".format( + (Error_test_nofeatures.sum() - Error_test.sum()) / Error_test_nofeatures.sum() + ) +) +print("Linear 
regression with feature selection:\n") +print("- Training error: {0}".format(Error_train_fs.mean())) +print("- Test error: {0}".format(Error_test_fs.mean())) +print( + "- R^2 train: {0}".format( + (Error_train_nofeatures.sum() - Error_train_fs.sum()) + / Error_train_nofeatures.sum() + ) +) +print( + "- R^2 test: {0}".format( + (Error_test_nofeatures.sum() - Error_test_fs.sum()) + / Error_test_nofeatures.sum() + ) +) figure(k) -subplot(1,3,2) -bmplot(attributeNames, range(1,Features.shape[1]+1), -Features) -clim(-1.5,0) -xlabel('Crossvalidation fold') -ylabel('Attribute') +subplot(1, 3, 2) +bmplot(attributeNames, range(1, Features.shape[1] + 1), -Features) +clim(-1.5, 0) +xlabel("Crossvalidation fold") +ylabel("Attribute") # Inspect selected feature coefficients effect on the entire dataset and # plot the fitted model residual error as function of each attribute to # inspect for systematic structure in the residual -f=2 # cross-validation fold to inspect -ff=Features[:,f-1].nonzero()[0] +f = 2 # cross-validation fold to inspect +ff = Features[:, f - 1].nonzero()[0] if len(ff) == 0: - print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).' ) + print( + "\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y)." + ) else: - m = lm.LinearRegression(fit_intercept=True).fit(X[:,ff], y) - - y_est= m.predict(X[:,ff]) - residual=y-y_est - - figure(k+1, figsize=(12,6)) - title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f)) - for i in range(0,len(ff)): - subplot(2, int( np.ceil(len(ff)/2)), i+1) - plot(X[:,ff[i]],residual,'.') - xlabel(attributeNames[ff[i]]) - ylabel('residual error') - - + m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y) + + y_est = m.predict(X[:, ff]) + residual = y - y_est + + figure(k + 1, figsize=(12, 6)) + title( + "Residual error vs. 
Attributes for features selected in cross-validation fold {0}".format( + f + ) + ) + for i in range(0, len(ff)): + subplot(2, int(np.ceil(len(ff) / 2)), i + 1) + plot(X[:, ff[i]], residual, ".") + xlabel(attributeNames[ff[i]]) + ylabel("residual error") + + show() -print('Ran Exercise 6.2.1') \ No newline at end of file +print("Ran Exercise 6.2.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py index 1f5bc97af4cabea37373697ba9779623d3e08831..ced5d6063730861eacc33c25a87429565dec69a2 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_3_1.py @@ -1,78 +1,96 @@ # exercise 6.3.1 -from matplotlib.pyplot import (figure, plot, title, xlabel, ylabel, - colorbar, imshow, xticks, yticks, show) +import importlib_resources +from matplotlib.pyplot import ( + colorbar, + figure, + imshow, + plot, + show, + title, + xlabel, + xticks, + ylabel, + yticks, +) from scipy.io import loadmat -from sklearn.neighbors import KNeighborsClassifier, DistanceMetric from sklearn.metrics import confusion_matrix -from numpy import cov -import scipy +from sklearn.neighbors import KNeighborsClassifier + +filename = importlib_resources.files("dtuimldmtools").joinpath( + "synth1.mat" +) # <-- change the number to change dataset # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth3.mat') # <-- change the number to change dataset) -X = mat_data['X'] -X_train = mat_data['X_train'] -X_test = mat_data['X_test'] -y = mat_data['y'].squeeze() -y_train = mat_data['y_train'].squeeze() -y_test = mat_data['y_test'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +X_train = mat_data["X_train"] +X_test = mat_data["X_test"] +y = mat_data["y"].squeeze() +y_train = mat_data["y_train"].squeeze() +y_test = mat_data["y_test"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) # Plot the training data points (color-coded) and test data points. figure(1) -styles = ['.b', '.r', '.g', '.y'] +styles = [".b", ".r", ".g", ".y"] for c in range(C): - class_mask = (y_train==c) - plot(X_train[class_mask,0], X_train[class_mask,1], styles[c]) + class_mask = y_train == c + plot(X_train[class_mask, 0], X_train[class_mask, 1], styles[c]) # K-nearest neighbors -K=5 +K = 5 # Distance metric (corresponds to 2nd norm, euclidean distance). # You can set dist=1 to obtain manhattan distance (cityblock distance). 
-dist=2 -metric = 'minkowski' -metric_params = {} # no parameters needed for minkowski +dist = 2 +metric = "minkowski" +metric_params = {} # no parameters needed for minkowski # You can set the metric argument to 'cosine' to determine the cosine distance -#metric = 'cosine' -#metric_params = {} # no parameters needed for cosine +# metric = 'cosine' +# metric_params = {} # no parameters needed for cosine # To use a mahalonobis distance, we need to input the covariance matrix, too: -#metric='mahalanobis' -#metric_params={'V': cov(X_train, rowvar=False)} +# metric='mahalanobis' +# metric_params={'V': cov(X_train, rowvar=False)} # Fit classifier and classify the test points -knclassifier = KNeighborsClassifier(n_neighbors=K, p=dist, - metric=metric, - metric_params=metric_params) +knclassifier = KNeighborsClassifier( + n_neighbors=K, p=dist, metric=metric, metric_params=metric_params +) knclassifier.fit(X_train, y_train) y_est = knclassifier.predict(X_test) # Plot the classfication results -styles = ['ob', 'or', 'og', 'oy'] +styles = ["ob", "or", "og", "oy"] for c in range(C): - class_mask = (y_est==c) - plot(X_test[class_mask,0], X_test[class_mask,1], styles[c], markersize=10) - plot(X_test[class_mask,0], X_test[class_mask,1], 'kx', markersize=8) -title('Synthetic data classification - KNN'); + class_mask = y_est == c + plot(X_test[class_mask, 0], X_test[class_mask, 1], styles[c], markersize=10) + plot(X_test[class_mask, 0], X_test[class_mask, 1], "kx", markersize=8) +title("Synthetic data classification - KNN") # Compute and plot confusion matrix -cm = confusion_matrix(y_test, y_est); -accuracy = 100*cm.diagonal().sum()/cm.sum(); error_rate = 100-accuracy; -figure(2); -imshow(cm, cmap='binary', interpolation='None'); +cm = confusion_matrix(y_test, y_est) +accuracy = 100 * cm.diagonal().sum() / cm.sum() +error_rate = 100 - accuracy +figure(2) +imshow(cm, cmap="binary", interpolation="None") colorbar() -xticks(range(C)); yticks(range(C)); -xlabel('Predicted class'); ylabel('Actual class'); -title('Confusion matrix (Accuracy: {0}%, Error Rate: {1}%)'.format(accuracy, error_rate)); +xticks(range(C)) +yticks(range(C)) +xlabel("Predicted class") +ylabel("Actual class") +title( + "Confusion matrix (Accuracy: {0}%, Error Rate: {1}%)".format(accuracy, error_rate) +) show() -print('Ran Exercise 6.3.1') \ No newline at end of file +print("Ran Exercise 6.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex6_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex6_3_2.py index c86ae1a2d55a0dbd3d54bb5d9a96e89757e9e928..2dda8a92f1187a9f45cd7e8d64a21a83fd0c12a4 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex6_3_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex6_3_2.py @@ -1,43 +1,43 @@ # exercise 6.3.2 -from matplotlib.pyplot import figure, plot, xlabel, ylabel, show import numpy as np -from scipy.io import loadmat -from sklearn.neighbors import KNeighborsClassifier -from sklearn import model_selection # requires data from exercise 1.5.1 from ex1_5_1 import * +from matplotlib.pyplot import figure, plot, show, xlabel, ylabel +from scipy.io import loadmat +from sklearn import model_selection +from sklearn.neighbors import KNeighborsClassifier # Maximum number of neighbors -L=40 +L = 40 CV = model_selection.LeaveOneOut() -errors = np.zeros((N,L)) -i=0 +errors = np.zeros((N, L)) +i = 0 for train_index, test_index in CV.split(X, y): - print('Crossvalidation fold: {0}/{1}'.format(i+1,N)) - + print("Crossvalidation fold: {0}/{1}".format(i + 1, N)) + # extract training and test set for current CV fold - 
X_train = X[train_index,:] + X_train = X[train_index, :] y_train = y[train_index] - X_test = X[test_index,:] + X_test = X[test_index, :] y_test = y[test_index] # Fit classifier and classify the test points (consider 1 to 40 neighbors) - for l in range(1,L+1): - knclassifier = KNeighborsClassifier(n_neighbors=l); - knclassifier.fit(X_train, y_train); - y_est = knclassifier.predict(X_test); - errors[i,l-1] = np.sum(y_est[0]!=y_test[0]) - - i+=1 - + for l in range(1, L + 1): + knclassifier = KNeighborsClassifier(n_neighbors=l) + knclassifier.fit(X_train, y_train) + y_est = knclassifier.predict(X_test) + errors[i, l - 1] = np.sum(y_est[0] != y_test[0]) + + i += 1 + # Plot the classification error rate figure() -plot(100*sum(errors,0)/N) -xlabel('Number of neighbors') -ylabel('Classification error rate (%)') +plot(100 * sum(errors, 0) / N) +xlabel("Number of neighbors") +ylabel("Classification error rate (%)") show() -print('Ran Exercise 6.3.2') \ No newline at end of file +print("Ran Exercise 6.3.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex7_1_1.py index f17d8655fa4a2371194d68ab46fc0f0648929244..ff2ad8158085486b6823fe4219c8bc6f372d21db 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_1_1.py @@ -1,30 +1,30 @@ -from matplotlib.pyplot import figure, plot, xlabel, ylabel, show import numpy as np -from scipy.io import loadmat -from sklearn.neighbors import KNeighborsClassifier -from sklearn import model_selection # requires data from exercise 1.5.1 from ex1_5_1 import * +from matplotlib.pyplot import figure, plot, show, xlabel, ylabel +from scipy.io import loadmat +from sklearn import model_selection +from sklearn.neighbors import KNeighborsClassifier # This script crates predictions from three KNN classifiers using cross-validation # Maximum number of neighbors -L=[1, 20, 80] +L = [1, 20, 80] CV = model_selection.LeaveOneOut() -i=0 +i = 0 # store predictions. yhat = [] y_true = [] for train_index, test_index in CV.split(X, y): - print('Crossvalidation fold: {0}/{1}'.format(i+1,N)) - + print("Crossvalidation fold: {0}/{1}".format(i + 1, N)) + # extract training and test set for current CV fold - X_train = X[train_index,:] + X_train = X[train_index, :] y_train = y[train_index] - X_test = X[test_index,:] + X_test = X[test_index, :] y_test = y[test_index] # Fit classifier and classify the test points (consider 1 to 40 neighbors) @@ -34,14 +34,14 @@ for train_index, test_index in CV.split(X, y): knclassifier.fit(X_train, y_train) y_est = knclassifier.predict(X_test) - dy.append( y_est ) + dy.append(y_est) # errors[i,l-1] = np.sum(y_est[0]!=y_test[0]) dy = np.stack(dy, axis=1) yhat.append(dy) y_true.append(y_test) - i+=1 + i += 1 yhat = np.concatenate(yhat) y_true = np.concatenate(y_true) -yhat[:,0] # predictions made by first classifier. +yhat[:, 0] # predictions made by first classifier. # Compute accuracy here. 
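Note on ex7_1_1 above: the script stores the per-fold predictions in yhat (one column per neighbor count in L) and the matching labels in y_true, but leaves the final step as a TODO ("# Compute accuracy here."). A minimal sketch of that step, assuming the yhat, y_true, and L arrays built by the script (not part of the committed file):

# Accuracy of each KNN classifier (one column of yhat per value in L),
# computed over all leave-one-out folds against the matching true labels.
accuracies = (yhat == y_true[:, np.newaxis]).mean(axis=0)
for n_neighbors, acc in zip(L, accuracies):
    print("KNN, k={0}: accuracy = {1:.3f}".format(n_neighbors, acc))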
diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex7_1_2.py index acab68070859d7157fdd48ea8bb11b805f09951e..75643903afb52674a4826edb23a99c2ac092b03a 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_1_2.py @@ -1,8 +1,9 @@ -from toolbox_02450 import jeffrey_interval from ex7_1_1 import * +from dtuimldmtools import jeffrey_interval + # Compute the Jeffreys interval alpha = 0.05 -[thetahatA, CIA] = jeffrey_interval(y_true, yhat[:,0], alpha=alpha) +[thetahatA, CIA] = jeffrey_interval(y_true, yhat[:, 0], alpha=alpha) print("Theta point estimate", thetahatA, " CI: ", CIA) diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_1_4.py b/exercises/02450Toolbox_Python/Scripts/ex7_1_4.py index a045b98b7ddc90ab6a9408abb495121a866577cd..ee648fe6468f7f1818ca5e6e25afc0cdd0ecfa3b 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_1_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_1_4.py @@ -1,8 +1,9 @@ -from toolbox_02450 import mcnemar from ex7_1_1 import * +from dtuimldmtools import mcnemar + # Compute the Jeffreys interval alpha = 0.05 -[thetahat, CI, p] = mcnemar(y_true, yhat[:,0], yhat[:,1], alpha=alpha) +[thetahat, CI, p] = mcnemar(y_true, yhat[:, 0], yhat[:, 1], alpha=alpha) print("theta = theta_A-theta_B point estimate", thetahat, " CI: ", CI, "p-value", p) diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex7_2_1.py index 7fe4cd1fc8c1b05870c4e7409c2f8437edcf4d49..4dd2032fddc1ea1734465ef997a53ba4e5e4ac76 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_2_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_2_1.py @@ -1,37 +1,43 @@ -from matplotlib.pyplot import figure, plot, xlabel, ylabel, show import numpy as np -from sklearn.neighbors import KNeighborsClassifier -from sklearn import model_selection -import sklearn.tree import scipy.stats -import numpy as np, scipy.stats as st +import scipy.stats as st +import sklearn.tree # requires data from exercise 1.5.1 from ex5_1_5 import * +from matplotlib.pyplot import figure, plot, show, xlabel, ylabel +from sklearn import model_selection +from sklearn.neighbors import KNeighborsClassifier -X,y = X[:,:10], X[:,10:] +X, y = X[:, :10], X[:, 10:] # This script crates predictions from three KNN classifiers using cross-validation test_proportion = 0.2 -X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=test_proportion) +X_train, X_test, y_train, y_test = model_selection.train_test_split( + X, y, test_size=test_proportion +) -mA = sklearn.linear_model.LinearRegression().fit(X_train,y_train) +mA = sklearn.linear_model.LinearRegression().fit(X_train, y_train) mB = sklearn.tree.DecisionTreeRegressor().fit(X_train, y_train) yhatA = mA.predict(X_test) -yhatB = mB.predict(X_test)[:,np.newaxis] # justsklearnthings +yhatB = mB.predict(X_test)[:, np.newaxis] # justsklearnthings # perform statistical comparison of the models # compute z with squared error. 
-zA = np.abs(y_test - yhatA ) ** 2 +zA = np.abs(y_test - yhatA) ** 2 # compute confidence interval of model A alpha = 0.05 -CIA = st.t.interval(1-alpha, df=len(zA)-1, loc=np.mean(zA), scale=st.sem(zA)) # Confidence interval +CIA = st.t.interval( + 1 - alpha, df=len(zA) - 1, loc=np.mean(zA), scale=st.sem(zA) +) # Confidence interval # Compute confidence interval of z = zA-zB and p-value of Null hypothesis -zB = np.abs(y_test - yhatB ) ** 2 +zB = np.abs(y_test - yhatB) ** 2 z = zA - zB -CI = st.t.interval(1-alpha, len(z)-1, loc=np.mean(z), scale=st.sem(z)) # Confidence interval -p = 2*st.t.cdf( -np.abs( np.mean(z) )/st.sem(z), df=len(z)-1) # p-value +CI = st.t.interval( + 1 - alpha, len(z) - 1, loc=np.mean(z), scale=st.sem(z) +) # Confidence interval +p = 2 * st.t.cdf(-np.abs(np.mean(z)) / st.sem(z), df=len(z) - 1) # p-value diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py index befd6439f3e510f5f0d4eabd16a4197fedb2812f..60a9781933948bef1770546e976a4307a1aaf39e 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_3_1.py @@ -1,16 +1,21 @@ -import sklearn.tree +import scipy.stats as st import sklearn.linear_model +import sklearn.tree -from toolbox_02450 import * # requires data from exercise 1.5.1 from ex5_1_5 import * +from sklearn import model_selection + +from dtuimldmtools import * +from dtuimldmtools.statistics.statistics import correlated_ttest loss = 2 -X,y = X[:,:10], X[:,10:] +X, y = X[:, :10], X[:, 10:] # This script crates predictions from three KNN classifiers using cross-validation -K = 10 # We presently set J=K +K = 10 m = 1 +J = 0 r = [] kf = model_selection.KFold(n_splits=K) @@ -19,7 +24,7 @@ for dm in range(m): yhat = [] for train_index, test_index in kf.split(X): - X_train, y_train = X[train_index,:], y[train_index] + X_train, y_train = X[train_index, :], y[train_index] X_test, y_test = X[test_index, :], y[test_index] mA = sklearn.linear_model.LinearRegression().fit(X_train, y_train) @@ -28,26 +33,30 @@ for dm in range(m): yhatA = mA.predict(X_test) yhatB = mB.predict(X_test)[:, np.newaxis] # justsklearnthings y_true.append(y_test) - yhat.append( np.concatenate([yhatA, yhatB], axis=1) ) + yhat.append(np.concatenate([yhatA, yhatB], axis=1)) - r.append( np.mean( np.abs( yhatA-y_test ) ** loss - np.abs( yhatB-y_test) ** loss ) ) + r.append( + np.mean(np.abs(yhatA - y_test) ** loss - np.abs(yhatB - y_test) ** loss) + ) # Initialize parameters and run test appropriate for setup II alpha = 0.05 -rho = 1/K +rho = 1 / K p_setupII, CI_setupII = correlated_ttest(r, rho, alpha=alpha) if m == 1: - y_true = np.concatenate(y_true)[:,0] + y_true = np.concatenate(y_true)[:, 0] yhat = np.concatenate(yhat) # note our usual setup I ttest only makes sense if m=1. 
- zA = np.abs(y_true - yhat[:,0] ) ** loss - zB = np.abs(y_true - yhat[:,1] ) ** loss + zA = np.abs(y_true - yhat[:, 0]) ** loss + zB = np.abs(y_true - yhat[:, 1]) ** loss z = zA - zB - CI_setupI = st.t.interval(1 - alpha, len(z) - 1, loc=np.mean(z), scale=st.sem(z)) # Confidence interval + CI_setupI = st.t.interval( + 1 - alpha, len(z) - 1, loc=np.mean(z), scale=st.sem(z) + ) # Confidence interval p_setupI = st.t.cdf(-np.abs(np.mean(z)) / st.sem(z), df=len(z) - 1) # p-value - print( [p_setupII, p_setupI] ) - print(CI_setupII, CI_setupI ) + print([p_setupII, p_setupI]) + print(CI_setupII, CI_setupI) diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_4_3.py b/exercises/02450Toolbox_Python/Scripts/ex7_4_3.py index 2f7da933c0f8a0c457543f49733230bde21b1dc6..eb329bd477c6467c1b476d5d1b9cff674b779645 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_4_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_4_3.py @@ -1,28 +1,41 @@ # exercise 7.4.3 +import importlib_resources import numpy as np # Load list of names from files -fmale = open('../Data/male.txt','r') -ffemale = open('../Data/female.txt','r') -mnames = fmale.readlines(); fnames = ffemale.readlines(); + +fmale = open(importlib_resources.files("dtuimldmtools").joinpath("data/male.txt"), "r") +ffemale = open( + importlib_resources.files("dtuimldmtools").joinpath("data/female.txt"), "r" +) +mnames = fmale.readlines() +fnames = ffemale.readlines() names = mnames + fnames -gender = [0]*len(mnames) + [1]*len(fnames) -fmale.close(); ffemale.close(); +gender = [0] * len(mnames) + [1] * len(fnames) +fmale.close() +ffemale.close() # Extract X, y and the rest of variables. Include only names of >4 characters. -X = np.zeros((len(names),4)) -y = np.zeros((len(names),1)) -n=0 -for i in range(0,len(names)): +X = np.zeros((len(names), 4)) +y = np.zeros((len(names), 1)) +n = 0 +for i in range(0, len(names)): name = names[i].strip().lower() - if len(name)>3: - X[n,:] = [ord(name[0])-ord('a')+1, ord(name[1])-ord('a')+1, ord(name[-2])-ord('a')+1, ord(name[-1])-ord('a')+1] - y[n,0] = gender[i] - n+=1 -X = X[0:n,:]; y = y[0:n,:]; + if len(name) > 3: + X[n, :] = [ + ord(name[0]) - ord("a") + 1, + ord(name[1]) - ord("a") + 1, + ord(name[-2]) - ord("a") + 1, + ord(name[-1]) - ord("a") + 1, + ] + y[n, 0] = gender[i] + n += 1 +X = X[0:n, :] +y = y[0:n, :] -N, M = X.shape; C = 2 -attributeNames = ['1st letter', '2nd letter', 'Next-to-last letter', 'Last letter'] -classNames = ['Female', 'Male']; +N, M = X.shape +C = 2 +attributeNames = ["1st letter", "2nd letter", "Next-to-last letter", "Last letter"] +classNames = ["Female", "Male"] -print('Ran Exercise 7.2.3') \ No newline at end of file +print("Ran Exercise 7.2.3") diff --git a/exercises/02450Toolbox_Python/Scripts/ex7_4_4.py b/exercises/02450Toolbox_Python/Scripts/ex7_4_4.py index 82d903db6eb6b109cf0419359fea60c50c1c4288..4af83e65de285ef26f990e04c2f5bb41d6ea21cd 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex7_4_4.py +++ b/exercises/02450Toolbox_Python/Scripts/ex7_4_4.py @@ -1,21 +1,22 @@ # exercise 7.4.4 -from sklearn.naive_bayes import MultinomialNB +import numpy as np +from ex7_4_3 import * from sklearn import model_selection +from sklearn.naive_bayes import MultinomialNB from sklearn.preprocessing import OneHotEncoder -from ex7_4_3 import * np.random.seed(2450) y = y.squeeze() 0 # Naive Bayes classifier parameters -alpha = 1.0 # pseudo-count, additive parameter (Laplace correction if 1.0 or Lidtstone smoothing otherwise) -fit_prior = True # uniform prior (change to True to estimate prior from 
data) +alpha = 1.0 # pseudo-count, additive parameter (Laplace correction if 1.0 or Lidtstone smoothing otherwise) +fit_prior = True # uniform prior (change to True to estimate prior from data) # K-fold crossvalidation K = 10 -CV = model_selection.KFold(n_splits=K,shuffle=True) +CV = model_selection.KFold(n_splits=K, shuffle=True) -X = X[:,0:4] # using all 4 letters, +X = X[:, 0:4] # using all 4 letters, # for using e.g. only third letter or first and last try X[:,[2]] and X[:, [0,3]] # We need to specify that the data is categorical. @@ -30,26 +31,25 @@ X = X[:,0:4] # using all 4 letters, X = OneHotEncoder().fit_transform(X=X) errors = np.zeros(K) -k=0 +k = 0 for train_index, test_index in CV.split(X): - #print('Crossvalidation fold: {0}/{1}'.format(k+1,K)) + # print('Crossvalidation fold: {0}/{1}'.format(k+1,K)) # extract training and test set for current CV fold - X_train = X[train_index,:] + X_train = X[train_index, :] y_train = y[train_index] - X_test = X[test_index,:] + X_test = X[test_index, :] y_test = y[test_index] - nb_classifier = MultinomialNB(alpha=alpha, - fit_prior=fit_prior) + nb_classifier = MultinomialNB(alpha=alpha, fit_prior=fit_prior) nb_classifier.fit(X_train, y_train) y_est_prob = nb_classifier.predict_proba(X_test) - y_est = np.argmax(y_est_prob,1) + y_est = np.argmax(y_est_prob, 1) - errors[k] = np.sum(y_est!=y_test,dtype=float)/y_test.shape[0] - k+=1 + errors[k] = np.sum(y_est != y_test, dtype=float) / y_test.shape[0] + k += 1 # Plot the classification error rate -print('Error rate: {0}%'.format(100*np.mean(errors))) +print("Error rate: {0}%".format(100 * np.mean(errors))) -print('Ran Exercise 7.2.4') +print("Ran Exercise 7.2.4") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex8_1_1.py index 30326f56a49472db96d7a79511068764d56bb4db..db4639dec1eb28c3485c874c408601ae8355eefd 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_1_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_1_1.py @@ -1,134 +1,187 @@ # exercise 8.1.1 -from matplotlib.pylab import (figure, semilogx, loglog, xlabel, ylabel, legend, - title, subplot, show, grid) +import importlib_resources import numpy as np -from scipy.io import loadmat import sklearn.linear_model as lm +from matplotlib.pylab import ( + figure, + grid, + legend, + loglog, + semilogx, + show, + subplot, + title, + xlabel, + ylabel, +) +from scipy.io import loadmat from sklearn import model_selection -from toolbox_02450 import rlr_validate -mat_data = loadmat('../Data/body.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] +from dtuimldmtools import rlr_validate + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/body.mat") + + +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] N, M = X.shape # Add offset attribute -X = np.concatenate((np.ones((X.shape[0],1)),X),1) -attributeNames = [u'Offset']+attributeNames -M = M+1 +X = np.concatenate((np.ones((X.shape[0], 1)), X), 1) +attributeNames = ["Offset"] + attributeNames +M = M + 1 ## Crossvalidation # Create crossvalidation partition for evaluation K = 5 CV = model_selection.KFold(K, shuffle=True) -#CV = model_selection.KFold(K, shuffle=False) +# CV = model_selection.KFold(K, shuffle=False) # Values of lambda -lambdas = np.power(10.,range(-5,9)) +lambdas = np.power(10.0, range(-5, 9)) # Initialize variables -#T = len(lambdas) -Error_train = 
np.empty((K,1)) -Error_test = np.empty((K,1)) -Error_train_rlr = np.empty((K,1)) -Error_test_rlr = np.empty((K,1)) -Error_train_nofeatures = np.empty((K,1)) -Error_test_nofeatures = np.empty((K,1)) -w_rlr = np.empty((M,K)) -mu = np.empty((K, M-1)) -sigma = np.empty((K, M-1)) -w_noreg = np.empty((M,K)) - -k=0 -for train_index, test_index in CV.split(X,y): - +# T = len(lambdas) +Error_train = np.empty((K, 1)) +Error_test = np.empty((K, 1)) +Error_train_rlr = np.empty((K, 1)) +Error_test_rlr = np.empty((K, 1)) +Error_train_nofeatures = np.empty((K, 1)) +Error_test_nofeatures = np.empty((K, 1)) +w_rlr = np.empty((M, K)) +mu = np.empty((K, M - 1)) +sigma = np.empty((K, M - 1)) +w_noreg = np.empty((M, K)) + +k = 0 +for train_index, test_index in CV.split(X, y): # extract training and test set for current CV fold X_train = X[train_index] y_train = y[train_index] X_test = X[test_index] y_test = y[test_index] - internal_cross_validation = 10 - - opt_val_err, opt_lambda, mean_w_vs_lambda, train_err_vs_lambda, test_err_vs_lambda = rlr_validate(X_train, y_train, lambdas, internal_cross_validation) + internal_cross_validation = 10 + + ( + opt_val_err, + opt_lambda, + mean_w_vs_lambda, + train_err_vs_lambda, + test_err_vs_lambda, + ) = rlr_validate(X_train, y_train, lambdas, internal_cross_validation) # Standardize outer fold based on training set, and save the mean and standard # deviations since they're part of the model (they would be needed for # making new predictions) - for brevity we won't always store these in the scripts mu[k, :] = np.mean(X_train[:, 1:], 0) sigma[k, :] = np.std(X_train[:, 1:], 0) - - X_train[:, 1:] = (X_train[:, 1:] - mu[k, :] ) / sigma[k, :] - X_test[:, 1:] = (X_test[:, 1:] - mu[k, :] ) / sigma[k, :] - + + X_train[:, 1:] = (X_train[:, 1:] - mu[k, :]) / sigma[k, :] + X_test[:, 1:] = (X_test[:, 1:] - mu[k, :]) / sigma[k, :] + Xty = X_train.T @ y_train XtX = X_train.T @ X_train - + # Compute mean squared error without using the input data at all - Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum(axis=0)/y_train.shape[0] - Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum(axis=0)/y_test.shape[0] + Error_train_nofeatures[k] = ( + np.square(y_train - y_train.mean()).sum(axis=0) / y_train.shape[0] + ) + Error_test_nofeatures[k] = ( + np.square(y_test - y_test.mean()).sum(axis=0) / y_test.shape[0] + ) # Estimate weights for the optimal value of lambda, on entire training set lambdaI = opt_lambda * np.eye(M) - lambdaI[0,0] = 0 # Do no regularize the bias term - w_rlr[:,k] = np.linalg.solve(XtX+lambdaI,Xty).squeeze() + lambdaI[0, 0] = 0 # Do no regularize the bias term + w_rlr[:, k] = np.linalg.solve(XtX + lambdaI, Xty).squeeze() # Compute mean squared error with regularization with optimal lambda - Error_train_rlr[k] = np.square(y_train-X_train @ w_rlr[:,k]).sum(axis=0)/y_train.shape[0] - Error_test_rlr[k] = np.square(y_test-X_test @ w_rlr[:,k]).sum(axis=0)/y_test.shape[0] + Error_train_rlr[k] = ( + np.square(y_train - X_train @ w_rlr[:, k]).sum(axis=0) / y_train.shape[0] + ) + Error_test_rlr[k] = ( + np.square(y_test - X_test @ w_rlr[:, k]).sum(axis=0) / y_test.shape[0] + ) # Estimate weights for unregularized linear regression, on entire training set - w_noreg[:,k] = np.linalg.solve(XtX,Xty).squeeze() + w_noreg[:, k] = np.linalg.solve(XtX, Xty).squeeze() # Compute mean squared error without regularization - Error_train[k] = np.square(y_train-X_train @ w_noreg[:,k]).sum(axis=0)/y_train.shape[0] - Error_test[k] = np.square(y_test-X_test @ 
w_noreg[:,k]).sum(axis=0)/y_test.shape[0] + Error_train[k] = ( + np.square(y_train - X_train @ w_noreg[:, k]).sum(axis=0) / y_train.shape[0] + ) + Error_test[k] = ( + np.square(y_test - X_test @ w_noreg[:, k]).sum(axis=0) / y_test.shape[0] + ) # OR ALTERNATIVELY: you can use sklearn.linear_model module for linear regression: - #m = lm.LinearRegression().fit(X_train, y_train) - #Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0] - #Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0] + # m = lm.LinearRegression().fit(X_train, y_train) + # Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0] + # Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0] # Display the results for the last cross-validation fold - if k == K-1: - figure(k, figsize=(12,8)) - subplot(1,2,1) - semilogx(lambdas,mean_w_vs_lambda.T[:,1:],'.-') # Don't plot the bias term - xlabel('Regularization factor') - ylabel('Mean Coefficient Values') + if k == K - 1: + figure(k, figsize=(12, 8)) + subplot(1, 2, 1) + semilogx(lambdas, mean_w_vs_lambda.T[:, 1:], ".-") # Don't plot the bias term + xlabel("Regularization factor") + ylabel("Mean Coefficient Values") grid() - # You can choose to display the legend, but it's omitted for a cleaner + # You can choose to display the legend, but it's omitted for a cleaner # plot, since there are many attributes - #legend(attributeNames[1:], loc='best') - - subplot(1,2,2) - title('Optimal lambda: 1e{0}'.format(np.log10(opt_lambda))) - loglog(lambdas,train_err_vs_lambda.T,'b.-',lambdas,test_err_vs_lambda.T,'r.-') - xlabel('Regularization factor') - ylabel('Squared error (crossvalidation)') - legend(['Train error','Validation error']) + # legend(attributeNames[1:], loc='best') + + subplot(1, 2, 2) + title("Optimal lambda: 1e{0}".format(np.log10(opt_lambda))) + loglog( + lambdas, train_err_vs_lambda.T, "b.-", lambdas, test_err_vs_lambda.T, "r.-" + ) + xlabel("Regularization factor") + ylabel("Squared error (crossvalidation)") + legend(["Train error", "Validation error"]) grid() - + # To inspect the used indices, use these print statements - #print('Cross validation fold {0}/{1}:'.format(k+1,K)) - #print('Train indices: {0}'.format(train_index)) - #print('Test indices: {0}\n'.format(test_index)) + # print('Cross validation fold {0}/{1}:'.format(k+1,K)) + # print('Train indices: {0}'.format(train_index)) + # print('Test indices: {0}\n'.format(test_index)) - k+=1 + k += 1 show() # Display results -print('Linear regression without feature selection:') -print('- Training error: {0}'.format(Error_train.mean())) -print('- Test error: {0}'.format(Error_test.mean())) -print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum())) -print('- R^2 test: {0}\n'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum())) -print('Regularized linear regression:') -print('- Training error: {0}'.format(Error_train_rlr.mean())) -print('- Test error: {0}'.format(Error_test_rlr.mean())) -print('- R^2 train: {0}'.format((Error_train_nofeatures.sum()-Error_train_rlr.sum())/Error_train_nofeatures.sum())) -print('- R^2 test: {0}\n'.format((Error_test_nofeatures.sum()-Error_test_rlr.sum())/Error_test_nofeatures.sum())) - -print('Weights in last fold:') +print("Linear regression without feature selection:") +print("- Training error: {0}".format(Error_train.mean())) +print("- Test error: {0}".format(Error_test.mean())) +print( + "- R^2 train: {0}".format( + 
(Error_train_nofeatures.sum() - Error_train.sum()) + / Error_train_nofeatures.sum() + ) +) +print( + "- R^2 test: {0}\n".format( + (Error_test_nofeatures.sum() - Error_test.sum()) / Error_test_nofeatures.sum() + ) +) +print("Regularized linear regression:") +print("- Training error: {0}".format(Error_train_rlr.mean())) +print("- Test error: {0}".format(Error_test_rlr.mean())) +print( + "- R^2 train: {0}".format( + (Error_train_nofeatures.sum() - Error_train_rlr.sum()) + / Error_train_nofeatures.sum() + ) +) +print( + "- R^2 test: {0}\n".format( + (Error_test_nofeatures.sum() - Error_test_rlr.sum()) + / Error_test_nofeatures.sum() + ) +) + +print("Weights in last fold:") for m in range(M): - print('{:>15} {:>15}'.format(attributeNames[m], np.round(w_rlr[m,-1],2))) + print("{:>15} {:>15}".format(attributeNames[m], np.round(w_rlr[m, -1], 2))) -print('Ran Exercise 8.1.1') \ No newline at end of file +print("Ran Exercise 8.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex8_1_2.py index 5240f058aadb675ebf9c2c66247249f0300d169c..ff9d6d954542a6095dcde78095b4d4f05573ce70 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_1_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_1_2.py @@ -1,32 +1,35 @@ # exercise 8.1.2 +import importlib_resources import matplotlib.pyplot as plt import numpy as np from scipy.io import loadmat -from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression -from toolbox_02450 import rocplot, confmatplot +from sklearn.model_selection import train_test_split + +from dtuimldmtools import confmatplot, rocplot +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") font_size = 15 -plt.rcParams.update({'font.size': font_size}) +plt.rcParams.update({"font.size": font_size}) # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine2.mat') -X = mat_data['X'] -y = mat_data['y'].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"].squeeze() +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) # Create crossvalidation partition for evaluation -# using stratification and 95 pct. split between training and test +# using stratification and 95 pct. split between training and test K = 20 -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.95, stratify=y) -# Try to change the test_size to e.g. 50 % and 99 % - how does that change the -# effect of regularization? How does differetn runs of test_size=.99 compare +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, stratify=y) +# Try to change the test_size to e.g. 50 % and 99 % - how does that change the +# effect of regularization? How does differetn runs of test_size=.99 compare # to eachother? 
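# (Illustrative sketch, not part of the patch, exploring the question above:
# repeat the split for a few random seeds and look at the spread of the
# resulting test errors. It assumes the X and y loaded above; names with a
# trailing underscore are local to this sketch and not from the original
# script.)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

errs_ = []
for seed_ in range(5):
    Xtr_, Xte_, ytr_, yte_ = train_test_split(
        X, y, test_size=0.99, stratify=y, random_state=seed_
    )
    mu_, sd_ = Xtr_.mean(0), Xtr_.std(0)
    mdl_ = LogisticRegression(penalty="l2", C=1.0, max_iter=1000)
    mdl_.fit((Xtr_ - mu_) / sd_, ytr_)
    errs_.append(np.mean(mdl_.predict((Xte_ - mu_) / sd_) != yte_))
print("Per-seed test error rates:", np.round(errs_, 3))
# A large spread across seeds signals that a 99 % test split leaves too little
# training data for a stable estimate, which is the point of the question.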
# Standardize the training and set set based on training set mean and std @@ -36,52 +39,59 @@ sigma = np.std(X_train, 0) X_train = (X_train - mu) / sigma X_test = (X_test - mu) / sigma -# Fit regularized logistic regression model to training data to predict +# Fit regularized logistic regression model to training data to predict # the type of wine lambda_interval = np.logspace(-8, 2, 50) train_error_rate = np.zeros(len(lambda_interval)) test_error_rate = np.zeros(len(lambda_interval)) coefficient_norm = np.zeros(len(lambda_interval)) for k in range(0, len(lambda_interval)): - mdl = LogisticRegression(penalty='l2', C=1/lambda_interval[k] ) - + mdl = LogisticRegression(penalty="l2", C=1 / lambda_interval[k]) + mdl.fit(X_train, y_train) y_train_est = mdl.predict(X_train).T y_test_est = mdl.predict(X_test).T - + train_error_rate[k] = np.sum(y_train_est != y_train) / len(y_train) test_error_rate[k] = np.sum(y_test_est != y_test) / len(y_test) - w_est = mdl.coef_[0] + w_est = mdl.coef_[0] coefficient_norm[k] = np.sqrt(np.sum(w_est**2)) min_error = np.min(test_error_rate) opt_lambda_idx = np.argmin(test_error_rate) opt_lambda = lambda_interval[opt_lambda_idx] -plt.figure(figsize=(8,8)) -#plt.plot(np.log10(lambda_interval), train_error_rate*100) -#plt.plot(np.log10(lambda_interval), test_error_rate*100) -#plt.plot(np.log10(opt_lambda), min_error*100, 'o') -plt.semilogx(lambda_interval, train_error_rate*100) -plt.semilogx(lambda_interval, test_error_rate*100) -plt.semilogx(opt_lambda, min_error*100, 'o') -plt.text(1e-8, 3, "Minimum test error: " + str(np.round(min_error*100,2)) + ' % at 1e' + str(np.round(np.log10(opt_lambda),2))) -plt.xlabel('Regularization strength, $\log_{10}(\lambda)$') -plt.ylabel('Error rate (%)') -plt.title('Classification error') -plt.legend(['Training error','Test error','Test minimum'],loc='upper right') +plt.figure(figsize=(8, 8)) +# plt.plot(np.log10(lambda_interval), train_error_rate*100) +# plt.plot(np.log10(lambda_interval), test_error_rate*100) +# plt.plot(np.log10(opt_lambda), min_error*100, 'o') +plt.semilogx(lambda_interval, train_error_rate * 100) +plt.semilogx(lambda_interval, test_error_rate * 100) +plt.semilogx(opt_lambda, min_error * 100, "o") +plt.text( + 1e-8, + 3, + "Minimum test error: " + + str(np.round(min_error * 100, 2)) + + " % at 1e" + + str(np.round(np.log10(opt_lambda), 2)), +) +plt.xlabel("Regularization strength, $\log_{10}(\lambda)$") +plt.ylabel("Error rate (%)") +plt.title("Classification error") +plt.legend(["Training error", "Test error", "Test minimum"], loc="upper right") plt.ylim([0, 4]) plt.grid() -plt.show() +plt.show() -plt.figure(figsize=(8,8)) -plt.semilogx(lambda_interval, coefficient_norm,'k') -plt.ylabel('L2 Norm') -plt.xlabel('Regularization strength, $\log_{10}(\lambda)$') -plt.title('Parameter vector L2 norm') +plt.figure(figsize=(8, 8)) +plt.semilogx(lambda_interval, coefficient_norm, "k") +plt.ylabel("L2 Norm") +plt.xlabel("Regularization strength, $\log_{10}(\lambda)$") +plt.title("Parameter vector L2 norm") plt.grid() -plt.show() +plt.show() -print('Ran Exercise 9.1.1') \ No newline at end of file +print("Ran Exercise 9.1.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex8_2_2.py index 00ef20fd2a6df145b866b5d6b6f9b2371c36b010..55b34a37765dc9e48b1e14da52f2f99406c46eda 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_2_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_2_2.py @@ -1,142 +1,172 @@ # exercise 8.2.2 +import importlib_resources import 
matplotlib.pyplot as plt import numpy as np +import torch from scipy.io import loadmat from sklearn import model_selection -from toolbox_02450 import train_neural_net, draw_neural_net, visualize_decision_boundary -import torch -plt.rcParams.update({'font.size': 12}) + +from dtuimldmtools import ( + draw_neural_net, + train_neural_net, + visualize_decision_boundary, +) + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/xor.mat") + +plt.rcParams.update({"font.size": 12}) # read XOR DATA from matlab datafile -mat_data = loadmat('../Data/xor.mat') -X = mat_data['X'] -y = mat_data['y'] +mat_data = loadmat(filename) +X = mat_data["X"] +y = mat_data["y"] -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0] for name in mat_data['classNames'].squeeze()] +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0] for name in mat_data["classNames"].squeeze()] N, M = X.shape C = len(classNames) # K-fold CrossValidation (4 folds here to speed up this example) K = 4 -CV = model_selection.KFold(K,shuffle=True) +CV = model_selection.KFold(K, shuffle=True) # Setup figure for display of the decision boundary for the several crossvalidation folds. -decision_boundaries = plt.figure(1, figsize=(10,10)) +decision_boundaries = plt.figure(1, figsize=(10, 10)) # Determine a size of a plot grid that fits visualizations for the chosen number # of cross-validation splits, if K=4, this is simply a 2-by-2 grid. -subplot_size_1 = int(np.floor(np.sqrt(K))) -subplot_size_2 = int(np.ceil(K/subplot_size_1)) +subplot_size_1 = int(np.floor(np.sqrt(K))) +subplot_size_2 = int(np.ceil(K / subplot_size_1)) # Set overall title for all of the subplots -plt.suptitle('Data and model decision boundaries', fontsize=20) +plt.suptitle("Data and model decision boundaries", fontsize=20) # Change spacing of subplots -plt.subplots_adjust(left=0, bottom=0, right=1, top=.9, wspace=.5, hspace=0.25) +plt.subplots_adjust(left=0, bottom=0, right=1, top=0.9, wspace=0.5, hspace=0.25) # Setup figure for display of learning curves and error rates in fold -summaries, summaries_axes = plt.subplots(1, 2, figsize=(10,5)) +summaries, summaries_axes = plt.subplots(1, 2, figsize=(10, 5)) # Make a list for storing assigned color of learning curve for up to K=10 -color_list = ['tab:orange', 'tab:green', 'tab:purple', 'tab:brown', 'tab:pink', - 'tab:gray', 'tab:olive', 'tab:cyan', 'tab:red', 'tab:blue'] +color_list = [ + "tab:orange", + "tab:green", + "tab:purple", + "tab:brown", + "tab:pink", + "tab:gray", + "tab:olive", + "tab:cyan", + "tab:red", + "tab:blue", +] # Define the model structure -n_hidden_units = 1 # number of hidden units in the signle hidden layer -# The lambda-syntax defines an anonymous function, which is used here to +n_hidden_units = 1 # number of hidden units in the signle hidden layer +# The lambda-syntax defines an anonymous function, which is used here to # make it easy to make new networks within each cross validation fold model = lambda: torch.nn.Sequential( - torch.nn.Linear(M, n_hidden_units), #M features to H hiden units - # 1st transfer function, either Tanh or ReLU: - torch.nn.Tanh(), #torch.nn.ReLU(), - torch.nn.Linear(n_hidden_units, 1), # H hidden units to 1 output neuron - torch.nn.Sigmoid() # final tranfer function - ) -# Since we're training a neural network for binary classification, we use a + torch.nn.Linear(M, n_hidden_units), # M features to H hiden units + # 1st transfer function, either Tanh or ReLU: + 
torch.nn.Tanh(), # torch.nn.ReLU(), + torch.nn.Linear(n_hidden_units, 1), # H hidden units to 1 output neuron + torch.nn.Sigmoid(), # final tranfer function +) +# Since we're training a neural network for binary classification, we use a # binary cross entropy loss (see the help(train_neural_net) for more on # the loss_fn input to the function) loss_fn = torch.nn.BCELoss() -# Train for a maximum of 10000 steps, or until convergence (see help for the +# Train for a maximum of 10000 steps, or until convergence (see help for the # function train_neural_net() for more on the tolerance/convergence)) max_iter = 10000 -print('Training model of type:\n{}\n'.format(str(model()))) +print("Training model of type:\n{}\n".format(str(model()))) # Do cross-validation: -errors = [] # make a list for storing generalizaition error in each loop -# Loop over each cross-validation split. The CV.split-method returns the -# indices to be used for training and testing in each split, and calling -# the enumerate-method with this simply returns this indices along with +errors = [] # make a list for storing generalizaition error in each loop +# Loop over each cross-validation split. The CV.split-method returns the +# indices to be used for training and testing in each split, and calling +# the enumerate-method with this simply returns this indices along with # a counter k: -for k, (train_index, test_index) in enumerate(CV.split(X,y)): - print('\nCrossvalidation fold: {0}/{1}'.format(k+1,K)) - - # Extract training and test set for current CV fold, +for k, (train_index, test_index) in enumerate(CV.split(X, y)): + print("\nCrossvalidation fold: {0}/{1}".format(k + 1, K)) + + # Extract training and test set for current CV fold, # and convert them to PyTorch tensors - X_train = torch.Tensor(X[train_index,:] ) - y_train = torch.Tensor(y[train_index] ) - X_test = torch.Tensor(X[test_index,:] ) - y_test = torch.Tensor(y[test_index] ) - - # Go to the file 'toolbox_02450.py' in the Tools sub-folder of the toolbox + X_train = torch.Tensor(X[train_index, :]) + y_train = torch.Tensor(y[train_index]) + X_test = torch.Tensor(X[test_index, :]) + y_test = torch.Tensor(y[test_index]) + + # Go to the file 'dtuimldmtools.py' in the Tools sub-folder of the toolbox # and see how the network is trained (search for 'def train_neural_net', # which is the place the function below is defined) - net, final_loss, learning_curve = train_neural_net(model, - loss_fn, - X=X_train, - y=y_train, - n_replicates=3, - max_iter=max_iter) - - print('\n\tBest loss: {}\n'.format(final_loss)) - + net, final_loss, learning_curve = train_neural_net( + model, loss_fn, X=X_train, y=y_train, n_replicates=3, max_iter=max_iter + ) + + print("\n\tBest loss: {}\n".format(final_loss)) + # Determine estimated class labels for test set - y_sigmoid = net(X_test) # activation of final note, i.e. prediction of network - y_test_est = (y_sigmoid > .5).type(dtype=torch.uint8) # threshold output of sigmoidal function + y_sigmoid = net(X_test) # activation of final note, i.e. 
prediction of network + y_test_est = (y_sigmoid > 0.5).type( + dtype=torch.uint8 + ) # threshold output of sigmoidal function y_test = y_test.type(dtype=torch.uint8) # Determine errors and error rate - e = (y_test_est != y_test) - error_rate = (sum(e).type(torch.float)/len(y_test)).data.numpy() - errors.append(error_rate) # store error rate for current CV fold - - # Make a subplot for current cross validation fold that displays the + e = y_test_est != y_test + error_rate = (sum(e).type(torch.float) / len(y_test)).data.numpy() + errors.append(error_rate) # store error rate for current CV fold + + # Make a subplot for current cross validation fold that displays the # decision boundary over the original data, "background color" corresponds # to the output of the sigmoidal transfer function (i.e. before threshold), - # white areas are areas of uncertainty, and a deaper red/blue means + # white areas are areas of uncertainty, and a deaper red/blue means # that the network "is more sure" of a given class. plt.figure(decision_boundaries.number) - plt.subplot(subplot_size_1,subplot_size_2,k+1) - plt.title('CV fold {0}'.format(k+1),color=color_list[k]) + plt.subplot(subplot_size_1, subplot_size_2, k + 1) + plt.title("CV fold {0}".format(k + 1), color=color_list[k]) predict = lambda x: net(torch.tensor(x, dtype=torch.float)).data.numpy() - visualize_decision_boundary(predict, X, y, # provide data, along with function for prediction - attributeNames, classNames, # provide information on attribute and class names - train=train_index, test=test_index, # provide information on partioning - show_legend=k==(K-1)) # only display legend for last plot - + visualize_decision_boundary( + predict, + X, + y, # provide data, along with function for prediction + attributeNames, + classNames, # provide information on attribute and class names + train=train_index, + test=test_index, # provide information on partioning + show_legend=k == (K - 1), + ) # only display legend for last plot + # Display the learning curve for the best net in the current fold - h, = summaries_axes[0].plot(learning_curve, color=color_list[k]) - h.set_label('CV fold {0}'.format(k+1)) - summaries_axes[0].set_xlabel('Iterations') + (h,) = summaries_axes[0].plot(learning_curve, color=color_list[k]) + h.set_label("CV fold {0}".format(k + 1)) + summaries_axes[0].set_xlabel("Iterations") summaries_axes[0].set_xlim((0, max_iter)) - summaries_axes[0].set_ylabel('Loss') - summaries_axes[0].set_title('Learning curves') - + summaries_axes[0].set_ylabel("Loss") + summaries_axes[0].set_title("Learning curves") + # Display the error rate across folds -summaries_axes[1].bar(np.arange(1, K+1), np.squeeze(np.asarray(errors)), color=color_list) -summaries_axes[1].set_xlabel('Fold'); -summaries_axes[1].set_xticks(np.arange(1, K+1)) -summaries_axes[1].set_ylabel('Error rate'); -summaries_axes[1].set_title('Test misclassification rates') - +summaries_axes[1].bar( + np.arange(1, K + 1), np.squeeze(np.asarray(errors)), color=color_list +) +summaries_axes[1].set_xlabel("Fold") +summaries_axes[1].set_xticks(np.arange(1, K + 1)) +summaries_axes[1].set_ylabel("Error rate") +summaries_axes[1].set_title("Test misclassification rates") + # Show the plots # plt.show(decision_boundaries.number) # try these lines if the following code fails (depends on package versions) # plt.show(summaries.number) plt.show() # Display a diagram of the best network in last fold -print('Diagram of best neural net in last fold:') -weights = [net[i].weight.data.numpy().T for i in [0,2]] -biases 
= [net[i].bias.data.numpy() for i in [0,2]] -tf = [str(net[i]) for i in [1,3]] +print("Diagram of best neural net in last fold:") +weights = [net[i].weight.data.numpy().T for i in [0, 2]] +biases = [net[i].bias.data.numpy() for i in [0, 2]] +tf = [str(net[i]) for i in [1, 3]] draw_neural_net(weights, biases, tf) # Print the average classification error rate -print('\nGeneralization error/average error rate: {0}%'.format(round(100*np.mean(errors),4))) +print( + "\nGeneralization error/average error rate: {0}%".format( + round(100 * np.mean(errors), 4) + ) +) -print('Ran exercise 8.2.2.') \ No newline at end of file +print("Ran exercise 8.2.2.") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_2_5.py b/exercises/02450Toolbox_Python/Scripts/ex8_2_5.py index 363b4b6c7d645f9bc169bed8076a1b790285583e..79889848a68f97952ade8e3a83401bad0676899b 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_2_5.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_2_5.py @@ -1,102 +1,125 @@ # exercise 8.2.5 +import importlib_resources import matplotlib.pyplot as plt import numpy as np -from scipy.io import loadmat import torch -from sklearn import model_selection -from toolbox_02450 import train_neural_net, draw_neural_net from scipy import stats +from scipy.io import loadmat +from sklearn import model_selection + +from dtuimldmtools import draw_neural_net, train_neural_net + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine2.mat') -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] -X = mat_data['X'] -y = mat_data['y'] +mat_data = loadmat(filename) +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] +X = mat_data["X"] +y = mat_data["y"] -#Downsample: X = X[1:20,:] y = y[1:20,:] +# Downsample: X = X[1:20,:] y = y[1:20,:] N, M = X.shape C = 2 # Normalize data -X = stats.zscore(X); +X = stats.zscore(X) # Parameters for neural network classifier -n_hidden_units = 2 # number of hidden units -n_replicates = 2 # number of networks trained in each k-fold -max_iter = 10000 # stop criterion 2 (max epochs in training) +n_hidden_units = 2 # number of hidden units +n_replicates = 2 # number of networks trained in each k-fold +max_iter = 10000 # stop criterion 2 (max epochs in training) # K-fold crossvalidation -K = 3 # only five folds to speed up this example +K = 3 # only five folds to speed up this example CV = model_selection.KFold(K, shuffle=True) # Make figure for holding summaries (errors and learning curves) -summaries, summaries_axes = plt.subplots(1,2, figsize=(10,5)) +summaries, summaries_axes = plt.subplots(1, 2, figsize=(10, 5)) # Make a list for storing assigned color of learning curve for up to K=10 -color_list = ['tab:orange', 'tab:green', 'tab:purple', 'tab:brown', 'tab:pink', - 'tab:gray', 'tab:olive', 'tab:cyan', 'tab:red', 'tab:blue'] +color_list = [ + "tab:orange", + "tab:green", + "tab:purple", + "tab:brown", + "tab:pink", + "tab:gray", + "tab:olive", + "tab:cyan", + "tab:red", + "tab:blue", +] # Define the model, see also Exercise 8.2.2-script for more information. 
model = lambda: torch.nn.Sequential( - torch.nn.Linear(M, n_hidden_units), #M features to H hiden units - torch.nn.Tanh(), # 1st transfer function, - torch.nn.Linear(n_hidden_units, 1), # H hidden units to 1 output neuron - torch.nn.Sigmoid() # final tranfer function - ) + torch.nn.Linear(M, n_hidden_units), # M features to H hiden units + torch.nn.Tanh(), # 1st transfer function, + torch.nn.Linear(n_hidden_units, 1), # H hidden units to 1 output neuron + torch.nn.Sigmoid(), # final tranfer function +) loss_fn = torch.nn.BCELoss() -print('Training model of type:\n\n{}\n'.format(str(model()))) -errors = [] # make a list for storing generalizaition error in each loop -for k, (train_index, test_index) in enumerate(CV.split(X,y)): - print('\nCrossvalidation fold: {0}/{1}'.format(k+1,K)) - +print("Training model of type:\n\n{}\n".format(str(model()))) +errors = [] # make a list for storing generalizaition error in each loop +for k, (train_index, test_index) in enumerate(CV.split(X, y)): + print("\nCrossvalidation fold: {0}/{1}".format(k + 1, K)) + # Extract training and test set for current CV fold, convert to tensors - X_train = torch.Tensor(X[train_index,:]) + X_train = torch.Tensor(X[train_index, :]) y_train = torch.Tensor(y[train_index]) - X_test = torch.Tensor(X[test_index,:]) + X_test = torch.Tensor(X[test_index, :]) y_test = torch.Tensor(y[test_index]) - + # Train the net on training data - net, final_loss, learning_curve = train_neural_net(model, - loss_fn, - X=X_train, - y=y_train, - n_replicates=n_replicates, - max_iter=max_iter) - - print('\n\tBest loss: {}\n'.format(final_loss)) - + net, final_loss, learning_curve = train_neural_net( + model, + loss_fn, + X=X_train, + y=y_train, + n_replicates=n_replicates, + max_iter=max_iter, + ) + + print("\n\tBest loss: {}\n".format(final_loss)) + # Determine estimated class labels for test set y_sigmoid = net(X_test) - y_test_est = (y_sigmoid>.5).type(dtype=torch.uint8) + y_test_est = (y_sigmoid > 0.5).type(dtype=torch.uint8) # Determine errors and errors y_test = y_test.type(dtype=torch.uint8) e = y_test_est != y_test - error_rate = (sum(e).type(torch.float)/len(y_test)).data.numpy() - errors.append(error_rate) # store error rate for current CV fold - + error_rate = (sum(e).type(torch.float) / len(y_test)).data.numpy() + errors.append(error_rate) # store error rate for current CV fold + # Display the learning curve for the best net in the current fold - h, = summaries_axes[0].plot(learning_curve, color=color_list[k]) - h.set_label('CV fold {0}'.format(k+1)) - summaries_axes[0].set_xlabel('Iterations') + (h,) = summaries_axes[0].plot(learning_curve, color=color_list[k]) + h.set_label("CV fold {0}".format(k + 1)) + summaries_axes[0].set_xlabel("Iterations") summaries_axes[0].set_xlim((0, max_iter)) - summaries_axes[0].set_ylabel('Loss') - summaries_axes[0].set_title('Learning curves') - + summaries_axes[0].set_ylabel("Loss") + summaries_axes[0].set_title("Learning curves") + # Display the error rate across folds -summaries_axes[1].bar(np.arange(1, K+1), np.squeeze(np.asarray(errors)), color=color_list) -summaries_axes[1].set_xlabel('Fold'); -summaries_axes[1].set_xticks(np.arange(1, K+1)) -summaries_axes[1].set_ylabel('Error rate'); -summaries_axes[1].set_title('Test misclassification rates') - -print('Diagram of best neural net in last fold:') -weights = [net[i].weight.data.numpy().T for i in [0,2]] -biases = [net[i].bias.data.numpy() for i in [0,2]] -tf = [str(net[i]) for i in [1,3]] +summaries_axes[1].bar( + np.arange(1, K + 1), 
np.squeeze(np.asarray(errors)), color=color_list +) +summaries_axes[1].set_xlabel("Fold") +summaries_axes[1].set_xticks(np.arange(1, K + 1)) +summaries_axes[1].set_ylabel("Error rate") +summaries_axes[1].set_title("Test misclassification rates") + +print("Diagram of best neural net in last fold:") +weights = [net[i].weight.data.numpy().T for i in [0, 2]] +biases = [net[i].bias.data.numpy() for i in [0, 2]] +tf = [str(net[i]) for i in [1, 3]] draw_neural_net(weights, biases, tf, attribute_names=attributeNames) # Print the average classification error rate -print('\nGeneralization error/average error rate: {0}%'.format(round(100*np.mean(errors),4))) +print( + "\nGeneralization error/average error rate: {0}%".format( + round(100 * np.mean(errors), 4) + ) +) -print('Ran Exercise 8.2.5') \ No newline at end of file +print("Ran Exercise 8.2.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_2_6.py b/exercises/02450Toolbox_Python/Scripts/ex8_2_6.py index 19c86cb86aadc57e4b6ec8b7b944e993b3d1f71f..1b017c15424a22a49820ad2ab599aa0e8d8196c4 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_2_6.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_2_6.py @@ -1,129 +1,154 @@ # exercise 8.2.6 +import importlib_resources import matplotlib.pyplot as plt import numpy as np -from scipy.io import loadmat import torch -from sklearn import model_selection -from toolbox_02450 import train_neural_net, draw_neural_net from scipy import stats +from scipy.io import loadmat +from sklearn import model_selection + +from dtuimldmtools import draw_neural_net, train_neural_net + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat") + # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/wine2.mat') -attributeNames = [name[0] for name in mat_data['attributeNames'][0]] -X = mat_data['X'] -y = X[:,[10]] # alcohol contents (target) -X = X[:,:10] # the rest of features +mat_data = loadmat(filename) +attributeNames = [name[0] for name in mat_data["attributeNames"][0]] +X = mat_data["X"] +y = X[:, [10]] # alcohol contents (target) +X = X[:, :10] # the rest of features N, M = X.shape C = 2 # Normalize data X = stats.zscore(X) - + ## Normalize and compute PCA (change to True to experiment with PCA preprocessing) do_pca_preprocessing = False if do_pca_preprocessing: - Y = stats.zscore(X,0) - U,S,V = np.linalg.svd(Y,full_matrices=False) + Y = stats.zscore(X, 0) + U, S, V = np.linalg.svd(Y, full_matrices=False) V = V.T - #Components to be included as features + # Components to be included as features k_pca = 3 - X = X @ V[:,:k_pca] + X = X @ V[:, :k_pca] N, M = X.shape # Parameters for neural network classifier -n_hidden_units = 2 # number of hidden units -n_replicates = 1 # number of networks trained in each k-fold +n_hidden_units = 2 # number of hidden units +n_replicates = 1 # number of networks trained in each k-fold max_iter = 10000 # K-fold crossvalidation -K = 3 # only three folds to speed up this example +K = 3 # only three folds to speed up this example CV = model_selection.KFold(K, shuffle=True) # Setup figure for display of learning curves and error rates in fold -summaries, summaries_axes = plt.subplots(1,2, figsize=(10,5)) +summaries, summaries_axes = plt.subplots(1, 2, figsize=(10, 5)) # Make a list for storing assigned color of learning curve for up to K=10 -color_list = ['tab:orange', 'tab:green', 'tab:purple', 'tab:brown', 'tab:pink', - 'tab:gray', 'tab:olive', 'tab:cyan', 'tab:red', 'tab:blue'] +color_list = [ + "tab:orange", + "tab:green", + 
"tab:purple", + "tab:brown", + "tab:pink", + "tab:gray", + "tab:olive", + "tab:cyan", + "tab:red", + "tab:blue", +] # Define the model model = lambda: torch.nn.Sequential( - torch.nn.Linear(M, n_hidden_units), #M features to n_hidden_units - torch.nn.Tanh(), # 1st transfer function, - torch.nn.Linear(n_hidden_units, 1), # n_hidden_units to 1 output neuron - # no final tranfer function, i.e. "linear output" - ) -loss_fn = torch.nn.MSELoss() # notice how this is now a mean-squared-error loss - -print('Training model of type:\n\n{}\n'.format(str(model()))) -errors = [] # make a list for storing generalizaition error in each loop -for (k, (train_index, test_index)) in enumerate(CV.split(X,y)): - print('\nCrossvalidation fold: {0}/{1}'.format(k+1,K)) - + torch.nn.Linear(M, n_hidden_units), # M features to n_hidden_units + torch.nn.Tanh(), # 1st transfer function, + torch.nn.Linear(n_hidden_units, 1), # n_hidden_units to 1 output neuron + # no final tranfer function, i.e. "linear output" +) +loss_fn = torch.nn.MSELoss() # notice how this is now a mean-squared-error loss + +print("Training model of type:\n\n{}\n".format(str(model()))) +errors = [] # make a list for storing generalizaition error in each loop +for k, (train_index, test_index) in enumerate(CV.split(X, y)): + print("\nCrossvalidation fold: {0}/{1}".format(k + 1, K)) + # Extract training and test set for current CV fold, convert to tensors - X_train = torch.Tensor(X[train_index,:]) + X_train = torch.Tensor(X[train_index, :]) y_train = torch.Tensor(y[train_index]) - X_test = torch.Tensor(X[test_index,:]) + X_test = torch.Tensor(X[test_index, :]) y_test = torch.Tensor(y[test_index]) - + # Train the net on training data - net, final_loss, learning_curve = train_neural_net(model, - loss_fn, - X=X_train, - y=y_train, - n_replicates=n_replicates, - max_iter=max_iter) - - print('\n\tBest loss: {}\n'.format(final_loss)) - + net, final_loss, learning_curve = train_neural_net( + model, + loss_fn, + X=X_train, + y=y_train, + n_replicates=n_replicates, + max_iter=max_iter, + ) + + print("\n\tBest loss: {}\n".format(final_loss)) + # Determine estimated class labels for test set y_test_est = net(X_test) - + # Determine errors and errors - se = (y_test_est.float()-y_test.float())**2 # squared error - mse = (sum(se).type(torch.float)/len(y_test)).data.numpy() #mean - errors.append(mse) # store error rate for current CV fold - + se = (y_test_est.float() - y_test.float()) ** 2 # squared error + mse = (sum(se).type(torch.float) / len(y_test)).data.numpy() # mean + errors.append(mse) # store error rate for current CV fold + # Display the learning curve for the best net in the current fold - h, = summaries_axes[0].plot(learning_curve, color=color_list[k]) - h.set_label('CV fold {0}'.format(k+1)) - summaries_axes[0].set_xlabel('Iterations') + (h,) = summaries_axes[0].plot(learning_curve, color=color_list[k]) + h.set_label("CV fold {0}".format(k + 1)) + summaries_axes[0].set_xlabel("Iterations") summaries_axes[0].set_xlim((0, max_iter)) - summaries_axes[0].set_ylabel('Loss') - summaries_axes[0].set_title('Learning curves') + summaries_axes[0].set_ylabel("Loss") + summaries_axes[0].set_title("Learning curves") # Display the MSE across folds -summaries_axes[1].bar(np.arange(1, K+1), np.squeeze(np.asarray(errors)), color=color_list) -summaries_axes[1].set_xlabel('Fold') -summaries_axes[1].set_xticks(np.arange(1, K+1)) -summaries_axes[1].set_ylabel('MSE') -summaries_axes[1].set_title('Test mean-squared-error') - -print('Diagram of best neural net in last 
fold:') -weights = [net[i].weight.data.numpy().T for i in [0,2]] -biases = [net[i].bias.data.numpy() for i in [0,2]] -tf = [str(net[i]) for i in [1,2]] +summaries_axes[1].bar( + np.arange(1, K + 1), np.squeeze(np.asarray(errors)), color=color_list +) +summaries_axes[1].set_xlabel("Fold") +summaries_axes[1].set_xticks(np.arange(1, K + 1)) +summaries_axes[1].set_ylabel("MSE") +summaries_axes[1].set_title("Test mean-squared-error") + +print("Diagram of best neural net in last fold:") +weights = [net[i].weight.data.numpy().T for i in [0, 2]] +biases = [net[i].bias.data.numpy() for i in [0, 2]] +tf = [str(net[i]) for i in [1, 2]] draw_neural_net(weights, biases, tf, attribute_names=attributeNames) # Print the average classification error rate -print('\nEstimated generalization error, RMSE: {0}'.format(round(np.sqrt(np.mean(errors)), 4))) +print( + "\nEstimated generalization error, RMSE: {0}".format( + round(np.sqrt(np.mean(errors)), 4) + ) +) # When dealing with regression outputs, a simple way of looking at the quality -# of predictions visually is by plotting the estimated value as a function of -# the true/known value - these values should all be along a straight line "y=x", +# of predictions visually is by plotting the estimated value as a function of +# the true/known value - these values should all be along a straight line "y=x", # and if the points are above the line, the model overestimates, whereas if the # points are below the y=x line, then the model underestimates the value -plt.figure(figsize=(10,10)) -y_est = y_test_est.data.numpy(); y_true = y_test.data.numpy() -axis_range = [np.min([y_est, y_true])-1,np.max([y_est, y_true])+1] -plt.plot(axis_range,axis_range,'k--') -plt.plot(y_true, y_est,'ob',alpha=.25) -plt.legend(['Perfect estimation','Model estimations']) -plt.title('Alcohol content: estimated versus true value (for last CV-fold)') -plt.ylim(axis_range); plt.xlim(axis_range) -plt.xlabel('True value') -plt.ylabel('Estimated value') +plt.figure(figsize=(10, 10)) +y_est = y_test_est.data.numpy() +y_true = y_test.data.numpy() +axis_range = [np.min([y_est, y_true]) - 1, np.max([y_est, y_true]) + 1] +plt.plot(axis_range, axis_range, "k--") +plt.plot(y_true, y_est, "ob", alpha=0.25) +plt.legend(["Perfect estimation", "Model estimations"]) +plt.title("Alcohol content: estimated versus true value (for last CV-fold)") +plt.ylim(axis_range) +plt.xlim(axis_range) +plt.xlabel("True value") +plt.ylabel("Estimated value") plt.grid() plt.show() -print('Ran Exercise 8.2.5') \ No newline at end of file +print("Ran Exercise 8.2.5") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_3_1.py b/exercises/02450Toolbox_Python/Scripts/ex8_3_1.py index 6645fd66eff4f587a1f61967d03419975afb91a9..e926d6751a9fc7b99aa786d1b217aa55a9a0e6d8 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_3_1.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_3_1.py @@ -1,63 +1,78 @@ # exercise 8.3.1 Fit neural network classifiers using softmax output weighting -from matplotlib.pyplot import figure, show, title -from scipy.io import loadmat -from toolbox_02450 import dbplotf, train_neural_net, visualize_decision_boundary +import importlib_resources import numpy as np import torch +from matplotlib.pyplot import figure, show, title +from scipy.io import loadmat + +from dtuimldmtools import dbplotf, train_neural_net, visualize_decision_boundary + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat") + + # Load Matlab data file and extract variables of interest -mat_data = 
loadmat('../Data/synth1.mat') -X = mat_data['X'] -X = X - np.ones((X.shape[0],1)) * np.mean(X,0) -X_train = mat_data['X_train'] -X_test = mat_data['X_test'] -y = mat_data['y'].squeeze() -y_train = mat_data['y_train'].squeeze() -y_test = mat_data['y_test'].squeeze() - -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +X = X - np.ones((X.shape[0], 1)) * np.mean(X, 0) +X_train = mat_data["X_train"] +X_test = mat_data["X_test"] +y = mat_data["y"].squeeze() +y_train = mat_data["y_train"].squeeze() +y_test = mat_data["y_test"].squeeze() + +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) -#%% Model fitting and prediction +# %% Model fitting and prediction # Define the model structure -n_hidden_units = 5 # number of hidden units in the signle hidden layer +n_hidden_units = 5 # number of hidden units in the signle hidden layer model = lambda: torch.nn.Sequential( - torch.nn.Linear(M, n_hidden_units), #M features to H hiden units - torch.nn.ReLU(), # 1st transfer function - # Output layer: - # H hidden units to C classes - # the nodes and their activation before the transfer - # function is often referred to as logits/logit output - torch.nn.Linear(n_hidden_units, C), # C logits - # To obtain normalised "probabilities" of each class - # we use the softmax-funtion along the "class" dimension - # (i.e. not the dimension describing observations) - torch.nn.Softmax(dim=1) # final tranfer function, normalisation of logit output - ) + torch.nn.Linear(M, n_hidden_units), # M features to H hiden units + torch.nn.ReLU(), # 1st transfer function + # Output layer: + # H hidden units to C classes + # the nodes and their activation before the transfer + # function is often referred to as logits/logit output + torch.nn.Linear(n_hidden_units, C), # C logits + # To obtain normalised "probabilities" of each class + # we use the softmax-funtion along the "class" dimension + # (i.e. 
not the dimension describing observations) + torch.nn.Softmax(dim=1), # final tranfer function, normalisation of logit output +) # Since we're training a multiclass problem, we cannot use binary cross entropy, # but instead use the general cross entropy loss: loss_fn = torch.nn.CrossEntropyLoss() # Train the network: -net, _, _ = train_neural_net(model, loss_fn, - X=torch.tensor(X_train, dtype=torch.float), - y=torch.tensor(y_train, dtype=torch.long), - n_replicates=3, - max_iter=10000) +net, _, _ = train_neural_net( + model, + loss_fn, + X=torch.tensor(X_train, dtype=torch.float), + y=torch.tensor(y_train, dtype=torch.long), + n_replicates=3, + max_iter=10000, +) # Determine probability of each class using trained network softmax_logits = net(torch.tensor(X_test, dtype=torch.float)) # Get the estimated class as the class with highest probability (argmax on softmax_logits) -y_test_est = (torch.max(softmax_logits, dim=1)[1]).data.numpy() +y_test_est = (torch.max(softmax_logits, dim=1)[1]).data.numpy() # Determine errors -e = (y_test_est != y_test) -print('Number of miss-classifications for ANN:\n\t {0} out of {1}'.format(sum(e),len(e))) +e = y_test_est != y_test +print( + "Number of miss-classifications for ANN:\n\t {0} out of {1}".format(sum(e), len(e)) +) -predict = lambda x: (torch.max(net(torch.tensor(x, dtype=torch.float)), dim=1)[1]).data.numpy() -figure(1,figsize=(9,9)) -visualize_decision_boundary(predict, [X_train, X_test], [y_train, y_test], attributeNames, classNames) -title('ANN decision boundaries') +predict = lambda x: ( + torch.max(net(torch.tensor(x, dtype=torch.float)), dim=1)[1] +).data.numpy() +figure(1, figsize=(9, 9)) +visualize_decision_boundary( + predict, [X_train, X_test], [y_train, y_test], attributeNames, classNames +) +title("ANN decision boundaries") show() -print('Ran Exercise 8.3.1') \ No newline at end of file +print("Ran Exercise 8.3.1") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_3_2.py b/exercises/02450Toolbox_Python/Scripts/ex8_3_2.py index 7d57d05c2e82c524c255e5318a442c6c0e3dc112..958582f5e62fd5a78cc315195acd666fee79560c 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_3_2.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_3_2.py @@ -1,42 +1,54 @@ # exercise 8.3.2 Fit multinomial regression -from matplotlib.pyplot import figure, show, title -from scipy.io import loadmat -from toolbox_02450 import dbplotf, train_neural_net, visualize_decision_boundary +import importlib_resources import numpy as np import sklearn.linear_model as lm +from matplotlib.pyplot import figure, show, title +from scipy.io import loadmat + +from dtuimldmtools import dbplotf, train_neural_net, visualize_decision_boundary + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth1.mat") # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth1.mat') -X = mat_data['X'] -X = X - np.ones((X.shape[0],1)) * np.mean(X,0) -X_train = mat_data['X_train'] -X_test = mat_data['X_test'] -y = mat_data['y'].squeeze() -y_train = mat_data['y_train'].squeeze() -y_test = mat_data['y_test'].squeeze() - -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +mat_data = loadmat(filename) +X = mat_data["X"] +X = X - np.ones((X.shape[0], 1)) * np.mean(X, 0) +X_train = mat_data["X_train"] +X_test = mat_data["X_test"] +y = mat_data["y"].squeeze() +y_train = mat_data["y_train"].squeeze() +y_test = mat_data["y_test"].squeeze() + +attributeNames = [name[0] for 
name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) -#%% Model fitting and prediction +# %% Model fitting and prediction # Multinomial logistic regression -logreg = lm.LogisticRegression(solver='lbfgs', multi_class='multinomial', tol=1e-4, random_state=1) -logreg.fit(X_train,y_train) +logreg = lm.LogisticRegression( + solver="lbfgs", multi_class="multinomial", tol=1e-4, random_state=1 +) +logreg.fit(X_train, y_train) -# To display coefficients use print(logreg.coef_). For a 4 class problem with a +# To display coefficients use print(logreg.coef_). For a 4 class problem with a # feature space, these weights will have shape (4, 2). # Number of miss-classifications -print('Number of miss-classifications for Multinormal regression:\n\t {0} out of {1}'.format(np.sum(logreg.predict(X_test)!=y_test),len(y_test))) - -predict = lambda x: np.argmax(logreg.predict_proba(x),1) -figure(2,figsize=(9,9)) -visualize_decision_boundary(predict, [X_train, X_test], [y_train, y_test], attributeNames, classNames) -title('LogReg decision boundaries') +print( + "Number of miss-classifications for Multinormal regression:\n\t {0} out of {1}".format( + np.sum(logreg.predict(X_test) != y_test), len(y_test) + ) +) + +predict = lambda x: np.argmax(logreg.predict_proba(x), 1) +figure(2, figsize=(9, 9)) +visualize_decision_boundary( + predict, [X_train, X_test], [y_train, y_test], attributeNames, classNames +) +title("LogReg decision boundaries") show() -print('Ran Exercise 8.3.2') \ No newline at end of file +print("Ran Exercise 8.3.2") diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py b/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py index 9cbbaa6fd95affa0d3d3d938cad47525b3947c08..4a6952c07107ae02aaf81196105b7db5483c8feb 100644 --- a/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py +++ b/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py @@ -1,26 +1,30 @@ # exercise 8.3.3 Fit regularized multinomial regression +import importlib_resources import matplotlib.pyplot as plt -from scipy.io import loadmat -from toolbox_02450 import dbplotf, train_neural_net, visualize_decision_boundary import numpy as np import sklearn.linear_model as lm +from scipy.io import loadmat + +from dtuimldmtools import dbplotf, train_neural_net, visualize_decision_boundary + +filename = importlib_resources.files("dtuimldmtools").joinpath("data/data/synth2.mat") # Load Matlab data file and extract variables of interest -mat_data = loadmat('../Data/synth2.mat') -X = mat_data['X'] -X_train = mat_data['X_train'] -X_test = mat_data['X_test'] -y = mat_data['y'].squeeze() -y_train = mat_data['y_train'].squeeze() -y_test = mat_data['y_test'].squeeze() +mat_data = loadmat(filename) +X = mat_data["X"] +X_train = mat_data["X_train"] +X_test = mat_data["X_test"] +y = mat_data["y"].squeeze() +y_train = mat_data["y_train"].squeeze() +y_test = mat_data["y_test"].squeeze() -attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()] -classNames = [name[0][0] for name in mat_data['classNames']] +attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()] +classNames = [name[0][0] for name in mat_data["classNames"]] N, M = X.shape C = len(classNames) -#%% Model fitting and prediction +# %% Model fitting and prediction # Standardize data based on training set mu = np.mean(X_train, 0) sigma = np.std(X_train, 0) @@ -29,29 +33,36 @@ X_test = (X_test - mu) / sigma # Fit multinomial logistic regression model regularization_strength = 1e-3 
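# (Illustrative note, not part of the patch: scikit-learn's LogisticRegression
# exposes the penalty through C, the *inverse* regularization strength, which
# is why the model below is constructed with C = 1/regularization_strength.
# For example, regularization_strength = 1e-3 gives C = 1e3 (a weak penalty),
# while 1e5 gives C = 1e-5 (a strong penalty).)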
diff --git a/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py b/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py
index 9cbbaa6fd95affa0d3d3d938cad47525b3947c08..4a6952c07107ae02aaf81196105b7db5483c8feb 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex8_3_3.py
@@ -1,26 +1,30 @@
 # exercise 8.3.3 Fit regularized multinomial regression
+import importlib_resources
 import matplotlib.pyplot as plt
-from scipy.io import loadmat
-from toolbox_02450 import dbplotf, train_neural_net, visualize_decision_boundary
 import numpy as np
 import sklearn.linear_model as lm
+from scipy.io import loadmat
+
+from dtuimldmtools import dbplotf, train_neural_net, visualize_decision_boundary
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth2.mat")

 # Load Matlab data file and extract variables of interest
-mat_data = loadmat('../Data/synth2.mat')
-X = mat_data['X']
-X_train = mat_data['X_train']
-X_test = mat_data['X_test']
-y = mat_data['y'].squeeze()
-y_train = mat_data['y_train'].squeeze()
-y_test = mat_data['y_test'].squeeze()
+mat_data = loadmat(filename)
+X = mat_data["X"]
+X_train = mat_data["X_train"]
+X_test = mat_data["X_test"]
+y = mat_data["y"].squeeze()
+y_train = mat_data["y_train"].squeeze()
+y_test = mat_data["y_test"].squeeze()

-attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
-classNames = [name[0][0] for name in mat_data['classNames']]
+attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
+classNames = [name[0][0] for name in mat_data["classNames"]]

 N, M = X.shape
 C = len(classNames)
-#%% Model fitting and prediction
+# %% Model fitting and prediction
 # Standardize data based on training set
 mu = np.mean(X_train, 0)
 sigma = np.std(X_train, 0)
@@ -29,29 +33,36 @@ X_test = (X_test - mu) / sigma
 # Fit multinomial logistic regression model
 regularization_strength = 1e-3
-#Try a high strength, e.g. 1e5, especially for synth2, synth3 and synth4
-mdl = lm.LogisticRegression(solver='lbfgs', multi_class='multinomial',
-                            tol=1e-4, random_state=1,
-                            penalty='l2', C=1/regularization_strength)
-mdl.fit(X_train,y_train)
+# Try a high strength, e.g. 1e5, especially for synth2, synth3 and synth4
+mdl = lm.LogisticRegression(
+    solver="lbfgs",
+    multi_class="multinomial",
+    tol=1e-4,
+    random_state=1,
+    penalty="l2",
+    C=1 / regularization_strength,
+)
+mdl.fit(X_train, y_train)
 y_test_est = mdl.predict(X_test)
-test_error_rate = np.sum(y_test_est!=y_test) / len(y_test)
+test_error_rate = np.sum(y_test_est != y_test) / len(y_test)

-predict = lambda x: np.argmax(mdl.predict_proba(x),1)
-plt.figure(2,figsize=(9,9))
-visualize_decision_boundary(predict, [X_train, X_test], [y_train, y_test], attributeNames, classNames)
-plt.title('LogReg decision boundaries')
+predict = lambda x: np.argmax(mdl.predict_proba(x), 1)
+plt.figure(2, figsize=(9, 9))
+visualize_decision_boundary(
+    predict, [X_train, X_test], [y_train, y_test], attributeNames, classNames
+)
+plt.title("LogReg decision boundaries")
 plt.show()

 # Number of miss-classifications
-print('Error rate: \n\t {0} % out of {1}'.format(test_error_rate*100,len(y_test)))
+print("Error rate: \n\t {0} % out of {1}".format(test_error_rate * 100, len(y_test)))
 # %%
-plt.figure(2, figsize=(9,9))
-plt.hist([y_train, y_test, y_test_est], color=['red','green','blue'], density=True)
-plt.legend(['Training labels','Test labels','Estimated test labels'])
+plt.figure(2, figsize=(9, 9))
+plt.hist([y_train, y_test, y_test_est], color=["red", "green", "blue"], density=True)
+plt.legend(["Training labels", "Test labels", "Estimated test labels"])

-print('Ran Exercise 8.3.2')
\ No newline at end of file
+print("Ran Exercise 8.3.3")
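scikit-learn's C is the inverse of the regularization strength used above (C = 1/lambda), so increasing regularization_strength shrinks the weights and smooths the decision boundary. A hedged sketch of sweeping lambda on invented, standardized toy data (dataset, grid and names are illustrative only):

    import numpy as np
    import sklearn.linear_model as lm
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    # Invented toy data, standardized with training-set statistics as in the exercise
    X_toy, y_toy = make_classification(
        n_samples=300, n_features=2, n_informative=2, n_redundant=0,
        n_classes=3, n_clusters_per_class=1, random_state=1,
    )
    X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, random_state=1)
    mu, sigma = X_tr.mean(0), X_tr.std(0)
    X_tr, X_te = (X_tr - mu) / sigma, (X_te - mu) / sigma

    for lam in [1e-3, 1e-1, 1e1, 1e3, 1e5]:
        mdl = lm.LogisticRegression(
            solver="lbfgs", penalty="l2", C=1 / lam, tol=1e-4, random_state=1
        )
        mdl.fit(X_tr, y_tr)
        err = np.mean(mdl.predict(X_te) != y_te)
        print("lambda = {:g}: test error rate {:.3f}".format(lam, err))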
diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py b/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py
index b49435a986f7a02bd664bc37680df036d361dd30..cd0cdc5c0721ec6581c6ba07be43e8ac4b3a299c 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex9_1_1.py
@@ -1,19 +1,21 @@
 # exercise 9.1.1
-from matplotlib.pyplot import figure, show
+import importlib_resources
 import numpy as np
+from matplotlib.pyplot import figure, show
 from scipy.io import loadmat
-from toolbox_02450 import dbplot, dbprobplot, bootstrap
-from toolbox_02450.bin_classifier_ensemble import BinClassifierEnsemble
 from sklearn.linear_model import LogisticRegression
+from dtuimldmtools import BinClassifierEnsemble, bootstrap, dbplot, dbprobplot
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth5.mat")

 # Load Matlab data file and extract variables of interest
-mat_data = loadmat('../Data/synth5.mat')
-X = mat_data['X']
-y = mat_data['y'].squeeze()
-attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
-classNames = [name[0][0] for name in mat_data['classNames']]
+mat_data = loadmat(filename)
+X = mat_data["X"]
+y = mat_data["y"].squeeze()
+attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
+classNames = [name[0][0] for name in mat_data["classNames"]]

 N, M = X.shape
 C = len(classNames)
@@ -23,18 +25,17 @@ C = len(classNames)
 L = 100

 # Weights for selecting samples in each bootstrap
-weights = np.ones((N,1),dtype=float)/N
+weights = np.ones((N, 1), dtype=float) / N

 # Storage of trained log.reg. classifiers fitted in each bootstrap
-logits = [0]*L
+logits = [0] * L
 votes = np.zeros((N,))

 # For each round of bagging
 for l in range(L):
-
     # Extract training set by random sampling with replacement from X and y
     X_train, y_train = bootstrap(X, y, N, weights)
-
+
     # Fit logistic regression model to training data and save result
     logit_classifier = LogisticRegression()
     logit_classifier.fit(X_train, y_train)
@@ -42,20 +43,22 @@ for l in range(L):
     y_est = logit_classifier.predict(X).T
     votes = votes + y_est

-    ErrorRate = (y!=y_est).sum(dtype=float)/N
-    print('Error rate: {:2.2f}%'.format(ErrorRate*100))
-
+    ErrorRate = (y != y_est).sum(dtype=float) / N
+    print("Error rate: {:2.2f}%".format(ErrorRate * 100))
+
 # Estimated value of class labels (using 0.5 as threshold) by majority voting
-y_est_ensemble = votes>(L/2)
+y_est_ensemble = votes > (L / 2)

 # Compute error rate
-ErrorRate = (y!=y_est_ensemble).sum(dtype=float)/N
-print('Error rate: {:3.2f}%'.format(ErrorRate*100))
+ErrorRate = (y != y_est_ensemble).sum(dtype=float) / N
+print("Error rate: {:3.2f}%".format(ErrorRate * 100))

 ce = BinClassifierEnsemble(logits)
-figure(1); dbprobplot(ce, X, y, 'auto', resolution=200)
-figure(2); dbplot(ce, X, y, 'auto', resolution=200)
+figure(1)
+dbprobplot(ce, X, y, "auto", resolution=200)
+figure(2)
+dbplot(ce, X, y, "auto", resolution=200)
 show()

-print('Ran Exercise 9.1.1')
\ No newline at end of file
+print("Ran Exercise 9.1.1")
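The loop above implements bagging by hand: each of the L classifiers is trained on a bootstrap sample, the 0/1 predictions are accumulated in votes, and thresholding at L/2 is a majority vote. A self-contained sketch of the same idea on invented two-blob data (the data and settings are illustrative; the exercise uses synth5.mat and the toolbox bootstrap helper):

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(1)
    # Invented toy data: two Gaussian blobs
    X_toy = np.vstack([rng.normal(0, 1, (100, 2)), rng.normal(2, 1, (100, 2))])
    y_toy = np.hstack([np.zeros(100, dtype=int), np.ones(100, dtype=int)])
    N, L = len(y_toy), 100

    votes = np.zeros(N)
    for _ in range(L):
        idx = rng.integers(0, N, N)  # bootstrap: draw N indices with replacement
        clf = LogisticRegression().fit(X_toy[idx], y_toy[idx])
        votes += clf.predict(X_toy)  # accumulate 0/1 votes over the ensemble

    y_est_ensemble = votes > (L / 2)  # majority vote, as in the exercise
    print("Ensemble error rate: {:.2f}%".format(np.mean(y_est_ensemble != y_toy) * 100))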
diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py b/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py
index 0ba86f12470fe76ee9129b27a11c3a01e2114f8d..a3ebddbc97f19582253abcb6c2d4b80a055e06f2 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex9_1_2.py
@@ -1,19 +1,23 @@
 # exercise 9.1.2
+import importlib_resources
 import matplotlib.pyplot as plt
 import numpy as np
 from scipy.io import loadmat
-from toolbox_02450 import dbplot, dbprobplot, bootstrap
-from toolbox_02450.bin_classifier_ensemble import BinClassifierEnsemble
 from sklearn.linear_model import LogisticRegression
+from dtuimldmtools import BinClassifierEnsemble, bootstrap, dbplot, dbprobplot
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth5.mat")
+
+
 # Load Matlab data file and extract variables of interest
-mat_data = loadmat('../Data/synth5.mat')
-X = mat_data['X']
-y = mat_data['y'].squeeze()
-attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
-classNames = [name[0][0] for name in mat_data['classNames']]
+mat_data = loadmat(filename)
+X = mat_data["X"]
+y = mat_data["y"].squeeze()
+attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
+classNames = [name[0][0] for name in mat_data["classNames"]]

 N, M = X.shape
 C = len(classNames)
@@ -23,78 +27,80 @@ C = len(classNames)
 L = 100

 # Weights for selecting samples in each bootstrap
-weights = np.ones((N,),dtype=float)/N
+weights = np.ones((N,), dtype=float) / N

 # Storage of trained log.reg. classifiers fitted in each bootstrap
-logits = [0]*L
-alpha = np.ones( (L,) )
-votes = np.zeros((N,1))
+logits = [0] * L
+alpha = np.ones((L,))
+votes = np.zeros((N, 1))
 epsi = 0
-y_all = np.zeros((N,L))
+y_all = np.zeros((N, L))
 y = y > 0.5

 # For each round of boosting
 for l in range(L):
-
     # Extract training set by random sampling with replacement from X and y
-    while True :
-        # not a thing of beauty, however log.reg. fails if presented with less than two classes.
-        X_train, y_train = bootstrap(X, y, N, weights)
-        if not (all(y_train==0) or all(y_train == 1)) : break
-
+    while True:
+        # not a thing of beauty, however log.reg. fails if presented with less than two classes.
+        X_train, y_train = bootstrap(X, y, N, weights)
+        if not (all(y_train == 0) or all(y_train == 1)):
+            break
+
     # Fit logistic regression model to training data and save result
-    # turn off regularization with C.
+    # turn off regularization with C.
     logit_classifier = LogisticRegression(C=1000)
-    logit_classifier.fit(X_train, y_train )
+    logit_classifier.fit(X_train, y_train)
     logits[l] = logit_classifier
     y_est = logit_classifier.predict(X).T > 0.5
-
-    y_all[:,l] = 1.0 * y_est
-    v = (y_est != y).T
-    ErrorRate = np.multiply(weights,v).sum()
+
+    y_all[:, l] = 1.0 * y_est
+    v = (y_est != y).T
+    ErrorRate = np.multiply(weights, v).sum()
     epsi = ErrorRate
-
-    alphai = 0.5 * np.log( (1-epsi)/epsi)
-
-    weights[y_est == y] = weights[y_est == y] * np.exp( -alphai )
-    weights[y_est != y] = weights[y_est != y] * np.exp( alphai )
-
+
+    alphai = 0.5 * np.log((1 - epsi) / epsi)
+
+    weights[y_est == y] = weights[y_est == y] * np.exp(-alphai)
+    weights[y_est != y] = weights[y_est != y] * np.exp(alphai)
+
     weights = weights / sum(weights)
-
+
     votes = votes + y_est
     alpha[l] = alphai
-    print('Error rate: {:2.2f}%'.format(ErrorRate*100))
-
-
+    print("Error rate: {:2.2f}%".format(ErrorRate * 100))
+
+
 # Estimated value of class labels (using 0.5 as threshold) by majority voting
-alpha = alpha/sum(alpha)
+alpha = alpha / sum(alpha)
 y_est_ensemble = y_all @ alpha > 0.5
-#y_est_ensemble = votes > (L/2)
-#y_est_ensemble = mat(y_all) * mat(alpha) - (1-mat(y_all)) * mat(alpha) > 0
-ErrorRateEnsemble = sum(y_est_ensemble != y)/N
+# y_est_ensemble = votes > (L/2)
+# y_est_ensemble = mat(y_all) * mat(alpha) - (1-mat(y_all)) * mat(alpha) > 0
+ErrorRateEnsemble = sum(y_est_ensemble != y) / N

 # Compute error rate
-#ErrorRate = (y!=y_est_ensemble).sum(dtype=float)/N
-print('Error rate for ensemble classifier: {:.1f}%'.format(ErrorRateEnsemble*100))
-
-ce = BinClassifierEnsemble(logits,alpha)
-#ce = BinClassifierEnsemble(logits) # What happens if alpha is not included?
-plt.figure(1); dbprobplot(ce, X, y, 'auto', resolution=200)
-plt.figure(2); dbplot(ce, X, y, 'auto', resolution=200)
-#plt.figure(3); plt.plot(alpha);
-
-#%%
-plt.figure(4,figsize=(8,8))
+# ErrorRate = (y!=y_est_ensemble).sum(dtype=float)/N
+print("Error rate for ensemble classifier: {:.1f}%".format(ErrorRateEnsemble * 100))
+
+ce = BinClassifierEnsemble(logits, alpha)
+# ce = BinClassifierEnsemble(logits) # What happens if alpha is not included?
+plt.figure(1)
+dbprobplot(ce, X, y, "auto", resolution=200)
+plt.figure(2)
+dbplot(ce, X, y, "auto", resolution=200)
+# plt.figure(3); plt.plot(alpha);
+
+# %%
+plt.figure(4, figsize=(8, 8))
 for i in range(2):
-    plt.plot(X[ (y_est_ensemble==i),0],X[ (y_est_ensemble==i),1],'br'[i] + 'o')
+    plt.plot(X[(y_est_ensemble == i), 0], X[(y_est_ensemble == i), 1], "br"[i] + "o")

 ## Uncomment the below lines to investigate miss-classifications
-#for i in range(2):
+# for i in range(2):
 #    plt.plot(X[ (y==i),0],X[ (y==i),1],'br'[i] + '.')

-plt.xlabel('Feature 1')
-plt.ylabel('Feature 2')
+plt.xlabel("Feature 1")
+plt.ylabel("Feature 2")
 plt.show()

-print('Ran Exercise 9.1.2')
\ No newline at end of file
+print("Ran Exercise 9.1.2")
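The loop above is AdaBoost with a logistic regression weak learner: each round computes the weighted error epsilon, sets alpha = 0.5 * ln((1 - epsilon) / epsilon), multiplies the weights of correctly classified samples by exp(-alpha) and of misclassified ones by exp(alpha), then renormalizes. A minimal sketch of one such update with made-up predictions (all arrays are invented for illustration):

    import numpy as np

    y_true = np.array([0, 1, 1, 0, 1], dtype=bool)
    y_pred = np.array([0, 1, 0, 0, 1], dtype=bool)  # one mistake, at index 2
    w = np.ones(5) / 5  # uniform weights, as in the first boosting round

    eps = w[y_pred != y_true].sum()  # weighted error rate (here 0.2)
    alpha = 0.5 * np.log((1 - eps) / eps)  # importance of this weak learner

    w[y_pred == y_true] *= np.exp(-alpha)  # shrink weights of correct samples
    w[y_pred != y_true] *= np.exp(alpha)  # boost weights of mistakes
    w /= w.sum()  # renormalize to a probability distribution

    # After one round the single misclassified sample carries half the total weight
    print(eps, alpha, w)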
diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py b/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py
index 3acfd65261d2e90d53dfeef97d9142441f679921..bba69f4da4a880649c0e5e29bbc1187319c7ffb7 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex9_1_3.py
@@ -1,16 +1,21 @@
 # exercise 9.1.3
+import importlib_resources
 from matplotlib.pyplot import figure, show
 from scipy.io import loadmat
-from toolbox_02450 import dbplot, dbprobplot
 from sklearn.ensemble import RandomForestClassifier
+from dtuimldmtools import dbplot, dbprobplot
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/synth7.mat")
+
+
 # Load Matlab data file and extract variables of interest
-mat_data = loadmat('../Data/synth7.mat')
-X = mat_data['X']
-y = mat_data['y'].squeeze()
-attributeNames = [name[0] for name in mat_data['attributeNames'].squeeze()]
-classNames = [name[0][0] for name in mat_data['classNames']]
+mat_data = loadmat(filename)
+X = mat_data["X"]
+y = mat_data["y"].squeeze()
+attributeNames = [name[0] for name in mat_data["attributeNames"].squeeze()]
+classNames = [name[0][0] for name in mat_data["classNames"]]

 N, M = X.shape
 C = len(classNames)
@@ -25,13 +30,15 @@ y_est = rf_classifier.predict(X).T
 y_est_prob = rf_classifier.predict_proba(X).T

 # Compute classification error
-ErrorRate = (y!=y_est).sum(dtype=float)/N
-print('Error rate: {:.2f}%'.format(ErrorRate*100))
+ErrorRate = (y != y_est).sum(dtype=float) / N
+print("Error rate: {:.2f}%".format(ErrorRate * 100))

-# Plot decision boundaries
-figure(1); dbprobplot(rf_classifier, X, y, 'auto', resolution=400)
-figure(2); dbplot(rf_classifier, X, y, 'auto', resolution=400)
+# Plot decision boundaries
+figure(1)
+dbprobplot(rf_classifier, X, y, "auto", resolution=400)
+figure(2)
+dbplot(rf_classifier, X, y, "auto", resolution=400)
 show()

-print('Ran Exercise 9.1.3')
\ No newline at end of file
+print("Ran Exercise 9.1.3")
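The random forest above is evaluated on the very data it was trained on, so the printed error rate is optimistic. A common sanity check is the out-of-bag estimate, where each tree is scored on the bootstrap samples it never saw; a sketch on invented data (oob_score is a standard RandomForestClassifier option, the dataset is illustrative and not the exercise's synth7 set):

    import numpy as np
    from sklearn.datasets import make_moons
    from sklearn.ensemble import RandomForestClassifier

    X_toy, y_toy = make_moons(n_samples=300, noise=0.3, random_state=1)

    rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=1)
    rf.fit(X_toy, y_toy)

    train_err = np.mean(rf.predict(X_toy) != y_toy)
    print("Training error rate: {:.2f}%".format(train_err * 100))  # close to zero
    print("Out-of-bag error rate: {:.2f}%".format((1 - rf.oob_score_) * 100))  # more realistic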
diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py b/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py
index f323eb78af82c6ac40e73693f7dd6639097fc2a5..8c6fe679b05e39dd43a22bd4b9a50cd5042d4f1a 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex9_2_1.py
@@ -1,18 +1,24 @@
 # exercise 9.2.1
+import importlib_resources
 from matplotlib.pyplot import figure, show
-#import numpy as np
+
+# import numpy as np
 from scipy.io import loadmat
-from sklearn.model_selection import StratifiedKFold
 from sklearn.linear_model import LogisticRegression
-from toolbox_02450 import rocplot, confmatplot
+from sklearn.model_selection import StratifiedKFold
+
+from dtuimldmtools import confmatplot, rocplot
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat")
+
 # Load Matlab data file and extract variables of interest
-mat_data = loadmat('../Data/wine2.mat')
-X = mat_data['X']
-y = mat_data['y'].squeeze()
-attributeNames = [name[0] for name in mat_data['attributeNames'][0]]
-classNames = [name[0][0] for name in mat_data['classNames']]
+mat_data = loadmat(filename)
+X = mat_data["X"]
+y = mat_data["y"].squeeze()
+attributeNames = [name[0] for name in mat_data["attributeNames"][0]]
+classNames = [name[0][0] for name in mat_data["classNames"]]
 N, M = X.shape
 C = len(classNames)
@@ -20,27 +26,27 @@ C = len(classNames)
 K = 2
 CV = StratifiedKFold(K, shuffle=True)

-k=0
-for train_index, test_index in CV.split(X,y):
+k = 0
+for train_index, test_index in CV.split(X, y):
     print(train_index)
     # extract training and test set for current CV fold
-    X_train, y_train = X[train_index,:], y[train_index]
-    X_test, y_test = X[test_index,:], y[test_index]
+    X_train, y_train = X[train_index, :], y[train_index]
+    X_test, y_test = X[test_index, :], y[test_index]

     logit_classifier = LogisticRegression()
     logit_classifier.fit(X_train, y_train)

     y_test_est = logit_classifier.predict(X_test).T
-    p = logit_classifier.predict_proba(X_test)[:,1].T
+    p = logit_classifier.predict_proba(X_test)[:, 1].T

     figure(k)
     rocplot(p, y_test)

-    figure(k+1)
-    confmatplot(y_test,y_test_est)
+    figure(k + 1)
+    confmatplot(y_test, y_test_est)
+
+    k += 2

-    k+=2
-
-show()
+show()

-print('Ran Exercise 9.2.1')
\ No newline at end of file
+print("Ran Exercise 9.2.1")
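rocplot and confmatplot visualize the ROC curve and the confusion matrix for each fold; the same quantities can be checked numerically with scikit-learn's metrics. A self-contained sketch with invented fold results (the arrays stand in for y_test, p and y_test_est from the loop above):

    import numpy as np
    from sklearn.metrics import confusion_matrix, roc_auc_score

    # Invented fold results: true labels, class-1 probabilities, thresholded estimates
    y_test = np.array([0, 0, 1, 1, 1, 0, 1, 0])
    p = np.array([0.1, 0.4, 0.8, 0.6, 0.3, 0.2, 0.9, 0.7])
    y_test_est = (p > 0.5).astype(int)

    auc = roc_auc_score(y_test, p)  # the area under the curve that rocplot draws
    cm = confusion_matrix(y_test, y_test_est)  # the counts that confmatplot displays
    print("AUC: {:.3f}".format(auc))
    print(cm)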
diff --git a/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py b/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py
index 2c4c075432105e307c9b6be1f9d89702cb1f242e..a6e2a331b6b3debc5f1affd77ec5bc681f563973 100644
--- a/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py
+++ b/exercises/02450Toolbox_Python/Scripts/ex9_2_2.py
@@ -1,21 +1,26 @@
 # exercise 9.2.2
+import importlib_resources
 from matplotlib.pyplot import figure, show
-#import numpy as np
+
+# import numpy as np
 from scipy.io import loadmat
-from sklearn.model_selection import StratifiedKFold
 from sklearn.linear_model import LogisticRegression
-from toolbox_02450 import rocplot, confmatplot
+from sklearn.model_selection import StratifiedKFold
+
+from dtuimldmtools import confmatplot, rocplot
+
+filename = importlib_resources.files("dtuimldmtools").joinpath("data/wine2.mat")

 # Load Matlab data file and extract variables of interest
-mat_data = loadmat('../Data/wine2.mat')
-X = mat_data['X']
-y = mat_data['y'].squeeze()
-attributeNames = [name[0] for name in mat_data['attributeNames'][0]]
-classNames = [name[0][0] for name in mat_data['classNames']]
-
-attribute_included = 10 # alcohol contents
-X = X[:,attribute_included].reshape(-1,1)
+mat_data = loadmat(filename)
+X = mat_data["X"]
+y = mat_data["y"].squeeze()
+attributeNames = [name[0] for name in mat_data["attributeNames"][0]]
+classNames = [name[0][0] for name in mat_data["classNames"]]
+
+attribute_included = 10  # alcohol contents
+X = X[:, attribute_included].reshape(-1, 1)
 attributeNames = attributeNames[attribute_included]
 N, M = X.shape
 C = len(classNames)
@@ -24,25 +29,25 @@ C = len(classNames)
 K = 2
 CV = StratifiedKFold(K, shuffle=True)

-k=0
-for train_index, test_index in CV.split(X,y):
+k = 0
+for train_index, test_index in CV.split(X, y):
     print(train_index)
     # extract training and test set for current CV fold
-    X_train, y_train = X[train_index,:], y[train_index]
-    X_test, y_test = X[test_index,:], y[test_index]
+    X_train, y_train = X[train_index, :], y[train_index]
+    X_test, y_test = X[test_index, :], y[test_index]

     logit_classifier = LogisticRegression()
     logit_classifier.fit(X_train, y_train)

     y_test_est = logit_classifier.predict(X_test).T
-    p = logit_classifier.predict_proba(X_test)[:,1].T
+    p = logit_classifier.predict_proba(X_test)[:, 1].T

     figure(k)
-    rocplot(p,y_test)
+    rocplot(p, y_test)
+
+    figure(k + 1)
+    confmatplot(y_test, y_test_est)

-    figure(k+1)
-    confmatplot(y_test,y_test_est)
+    k += 2

-    k+=2
-
-show()
\ No newline at end of file
+show()
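Exercise 9.2.2 repeats the previous script using only attribute 10 (alcohol content), so the natural follow-up is how much discriminative power a single feature retains compared with the full feature set. A hedged sketch of that comparison on invented data (the dataset, column index and AUC comparison are illustrative; the exercise itself uses wine2.mat):

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    X_toy, y_toy = make_classification(n_samples=500, n_features=11, n_informative=4, random_state=1)
    X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, stratify=y_toy, random_state=1)

    # Indexing with a list keeps the matrix 2-D (the script uses reshape(-1, 1) for the same purpose)
    for name, cols in [("all features", slice(None)), ("single feature", [10])]:
        clf = LogisticRegression().fit(X_tr[:, cols], y_tr)
        p = clf.predict_proba(X_te[:, cols])[:, 1]
        print("{}: AUC = {:.3f}".format(name, roc_auc_score(y_te, p)))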