From ea5a6d82a3eda2c34148c3044bd9dd2bd82bdf7a Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Sat, 6 Jul 2024 07:32:27 +0200 Subject: [PATCH 1/8] added support for localization --- .gitignore | 1 - .../fedorenko2010_localization/__init__.py | 33 ++++ .../langloc_fmri_run1_stim_set1.csv | 49 +++++ .../langloc_fmri_run1_stim_set2.csv | 49 +++++ .../langloc_fmri_run1_stim_set3.csv | 49 +++++ .../langloc_fmri_run1_stim_set4.csv | 49 +++++ .../langloc_fmri_run1_stim_set5.csv | 49 +++++ .../langloc_fmri_run2_stim_set1.csv | 49 +++++ .../langloc_fmri_run2_stim_set2.csv | 49 +++++ .../langloc_fmri_run2_stim_set3.csv | 49 +++++ .../langloc_fmri_run2_stim_set4.csv | 49 +++++ .../langloc_fmri_run2_stim_set5.csv | 49 +++++ .../model_helpers/huggingface.py | 38 +++- brainscore_language/model_helpers/localize.py | 174 ++++++++++++++++++ examples/score_localization.py | 26 +++ 15 files changed, 756 insertions(+), 6 deletions(-) create mode 100644 brainscore_language/data/fedorenko2010_localization/__init__.py create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set1.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set2.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set3.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set4.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set5.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set1.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set2.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set3.csv create mode 100644 brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set4.csv create mode 100644 
brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set5.csv create mode 100644 brainscore_language/model_helpers/localize.py create mode 100644 examples/score_localization.py diff --git a/.gitignore b/.gitignore index 5f7bafda..8a001910 100644 --- a/.gitignore +++ b/.gitignore @@ -136,7 +136,6 @@ dmypy.json ### project specific additions: -brainscore_language/data html .vscode *.code-workspace diff --git a/brainscore_language/data/fedorenko2010_localization/__init__.py b/brainscore_language/data/fedorenko2010_localization/__init__.py new file mode 100644 index 00000000..22a51d27 --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/__init__.py @@ -0,0 +1,33 @@ +import pandas as pd + +from glob import glob +from pathlib import Path + +from brainscore_language import data_registry + +BIBTEX = """@article{Fedorenko2010NewMF, + title={New method for fMRI investigations of language: defining ROIs functionally in individual subjects.}, + author={Evelina Fedorenko and Po-Jang Hsieh and Alfonso Nieto-Castanon and Susan L. Whitfield-Gabrieli and Nancy G. 
Kanwisher}, + journal={Journal of neurophysiology}, + year={2010}, + volume={104 2}, + pages={1177-94}, + url={https://api.semanticscholar.org/CorpusID:740913} +}""" + +# Code adapted from: https://github.com/bkhmsi/brain-language-suma + +def load_data(): + paths = glob(f"{Path(__file__).parent }/*.csv") + data = pd.read_csv(paths[0]) + for path in paths[1:]: + run_data = pd.read_csv(path) + data = pd.concat([data, run_data]) + + data["sent"] = data["stim2"].apply(str.lower) + + for stimuli_idx in range(3, 14): + data["sent"] += " " + data[f"stim{stimuli_idx}"].apply(str.lower) + return data + +data_registry['Fedorenko2010.localization'] = load_data \ No newline at end of file diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set1.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set1.csv new file mode 100644 index 00000000..ee684017 --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set1.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,JUST,THE,BAREST,SUGGESTION,OF,A,HEEL,IS,FOUND,ON,TEENAGE,PUMPS,S +2,TO,THE,DIRECTORS,THE,PROBLEM,APPEARED,A,MATTER,OF,INTRIGUE,OR,DIPLOMACY,S +3,THERE,WAS,LITTLE,LIKELIHOOD,OF,ANY,CUSTOMERS,WALKING,IN,AT,THAT,HOUR,S +1,POME,OY,REE,HOLILY,SHOURN,NE,SLEOMING,WHIMP,REE,RERE,OS,OFUKE,N +2,OT,MOMP,VO,DETLERENCE,FROT,MOGS,ELIBONCE,POLVED,RO,OP,UMMOSITE,COMBLISION,N +3,CHITMENTS,OY,ORLS,TROR,WENDERT,COONGLIES,COURN,MOMICONLY,NE,SOOZED,AR,CONTROGOME,N +4,E,WOSE,RO,SPEONT,REE,INTLOSSION,OY,O,COMBOUSE,FUMS,OY,CHIGSHEN,N +5,KAKE,WEWS,BAPS,OSE,RECOSSED,REE,LENCHEN,WESEN,OY,REE,SUSSER,PRODENOTIONS,N +6,HU,WRELT,DOIL,ET,HUS,BEP,AR,LONK,AR,HU,COURN,KNEOL,N +4,HE,STOPPED,PACING,TO,STARE,AT,HAL,WITH,HIS,PALE,BLUE,EYES,S +5,A,NUMBER,OF,CONSIDERATIONS,SUGGEST,THAT,THIS,OCCURS,EARLY,IN,THE,PROCESS,S +6,TO,COMPUTE,YOUR,ADJUSTED,GROSS,INCOME,YOU,TOTAL,ALL,ITEMS,OF,INCOME,S 
+7,TRORE,OS,VO,UTROSION,RO,FEOL,ONIEDY,OM,DISTOLVES,ONIET,BLESE,DOOPLE,N +8,OMOILLY,TRORE,ONK,HORE,FOWBORS,RO,TOOD,SIKE,PLORRING,FRON,FISMS,INTLOSSIONS,N +9,OT,WAM,O,DELOOF,RO,SHEFT,EN,HUS,MOND,RO,TOSSMICOL,TROPIENS,N +7,HIS,WIFE,WAS,IN,DELICATE,HEALTH,AND,NURSING,AN,INFANT,WITH,MEASLES,S +8,HE,AVOIDED,SHOWING,ANY,SURPRISE,OR,ANNOYANCE,WHEN,NO,ONE,ANSWERED,HIM,S +9,KEITH,TOLD,PENNY,ABOUT,HIS,DREAM,TO,RETURN,TO,INDIA,AND,BURMA,S +10,REE,UMLY,EXPOITION,RO,PLIN,OS,MERTIVE,BOES,FROT,HABS,BEMIME,PENORITES,N +11,OT,WAM,LOKE,O,LONK,SLIN,LIGS,DRARK,SCROUGH,O,PIST,CIBSLE,N +12,E,COURN,VOT,CLIVE,RO,HY,POMS,NOM,DOD,E,WIBS,RO,N +10,THE,REPORTER,NODDED,AS,HE,MOVED,UP,BESIDE,HIM,AT,THE,BAR,S +11,IN,THE,STARLIGHT,HE,COULD,SEE,THE,TREES,STRIPPED,OF,THEIR,LEAVES,S +12,THE,TARGET,CHART,QUICKLY,AND,BRIEFLY,TELLS,YOU,WHICH,ADDITIVES,DO,WHAT,S +13,NEVER,AGAIN,DID,HE,ENTER,INTO,THE,RITUAL,OF,SHOWING,THE,APARTMENT,S +14,THEN,ANGELINA,TURNED,AND,WITH,AN,EASY,GRACE,WALKED,TOWARD,THE,KITCHEN,S +15,MANY,TIMES,SINCE,HIS,DEATH,THAT,MEMORY,HAD,WORRIED,AND,TROUBLED,HER,S +13,EOKS,OY,THOTE,FOCKETS,WAM,OY,CREOT,VOSUE,RO,OTS,MIGHTBEL,REMETIERS,N +14,E,RURNED,OSE,WONCHED,HOM,STRILE,DOIL,REE,CISTRE,OY,REE,FOUD,N +15,HU,WAM,POOTYING,O,BADES,OY,MOURLOUGH,BISPOUTS,FOM,REE,TULSH,UDEN,N +16,KYOTO,IS,THE,ANCIENT,CAPITAL,OF,JAPAN,AND,STILL,ITS,CULTURAL,CENTER,S +17,IT,IS,VERY,MUCH,A,MATTER,OF,BUILDING,THE,FOUNDATIONS,OF,COMMUNITY,S +18,THE,IMAGES,CAN,EASILY,BE,ALIGNED,WITH,A,HIGH,DEGREE,OF,ACCURACY,S +16,NIR,GLORKS,SWUBS,POMS,REE,CHOINER,SLERE,REE,WRO,CLIRERS,DERE,SHENDING,N +17,SNUSTIONS,CADE,RO,TE,TROR,ORS,SIFFS,ONIET,HY,WONDE,CITICORPHIP,OUDETITIES,N +18,E,INTROFORNE,HY,FRIEST,FANDY,RO,PEPS,OSE,DE,MOMP,OURBOLVES,CORCOINOBLE,N +19,LUT,REE,UMLY,LOND,E,WAM,GOVING,HOM,WAM,REE,PUNE,KILE,N +20,HU,BUCKOTES,RO,NIR,TROR,REE,WOOR,OSE,CHE,SLICKED,CLIONLY,OUTNODE,N +21,CY,PLIN,TICE,LOLKS,NURGERS,OY,REE,ORIENCE,HUD,LELD,REE,HORS,N +19,THIS,HAPPENED,IN,THE,MIDDLE,OF,A,DRINKING,BOUT,WITH,ANOTHER,BUM,S 
+20,WINSTON,TOOK,THE,CLOTHESBRUSH,OUT,OF,THE,CLOSET,AND,WENT,TO,WORK,S +21,THE,ROOF,BLOCKS,ARE,IN,TWO,LAYERS,AND,ARE,NOT,MORTARED,TOGETHER,S +22,I,WAS,HELD,UP,A,BIT,TRYING,TO,MAKE,A,LEFT,TURN,S +23,HE,CALLED,THE,STORE,OWNER,AND,TOGETHER,THEY,WENT,INTO,THE,STOCKROOM,S +24,AFTER,THAT,HE,WAS,NEVER,KNOWN,TO,RUN,OR,EVEN,WALK,FAST,S +22,REE,GISK,KNEENED,CY,NIR,HUBTOND,PITH,OVE,ORL,ET,HUS,BAPS,N +23,MIMP,CAINTS,REE,BOMS,JUMS,AR,REE,BONCHER,SLOD,INNI,REE,BAVE,N +24,SHINNON,TICED,OT,SU,FROT,QUEY,ROSS,EN,OP,MOUR,AULER,PIGHTFOME,N diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set2.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set2.csv new file mode 100644 index 00000000..0cded43e --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set2.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,PART,OF,THE,FAMILY,SHOULD,BE,SLEEPING,WHILE,THE,REST,IS,AWAKE,S +2,IT,MADE,NO,DIFFERENCE,THAT,MOST,EVIDENCE,POINTS,TO,AN,OPPOSITE,CONCLUSION,S +3,SHIPMENTS,OF,ARMS,FROM,WESTERN,COUNTRIES,COULD,SIMILARLY,BE,SEIZED,AS,CONTRABAND,S +1,HU,ONVINTED,O,STOLT,OY,TOCOL,ESDERTS,OSE,OY,PORY,ZOOSOUS,GEMUNTEERS,N +2,HU,BEXES,HUS,OPPREYES,OD,REE,BEROOF,FROT,OTUBLING,CAG,LILLOW,OTUBLING,N +3,EN,PLIN,SCOY,TRORE,DERE,SU,PORY,STORECTEST,OSE,SU,MUME,DETIVE,N +4,DRUTTY,OCTOILED,REE,PEVISION,PITH,INDETBERESTS,OSE,DOD,VOT,URTER,REE,ONCUMENCE,N +5,PLIN,PROCEFOSE,OS,MUME,HORE,EGGOCTIVE,FRON,GOVING,OOT,O,ROMMERSHIP,MICKET,N +6,MOGS,SCHOAT,MYNTEMS,LOGOY,MOILBOIN,O,MYNTEM,OY,CUROCOTIVE,REGONDS,OY,MUPOLS,N +4,I,WANT,TO,CREATE,THE,IMPRESSION,OF,A,COMPOUND,FULL,OF,CHILDREN,S +5,KATE,WENT,BACK,AND,REMINDED,THE,KITCHEN,WOMEN,OF,THE,SUPPER,PREPARATIONS,S +6,HE,KNELT,DOWN,AT,HIS,BED,AS,LONG,AS,HE,COULD,KNEEL,S +7,SOLFORE,HUD,MOFFS,BEOVES,REE,RORMS,OY,REE,LIFFS,TROR,REE,GONDOW,N 
+8,OSE,PLIN,WAM,LEFOSE,HU,BENON,RO,SCOY,HUS,STISQUINGLY,BOOLTISUL,JOWD,N +9,OT,OSTEORED,FROT,REE,PLEUPY,COURN,NE,SOMES,EN,OVE,WOB,UMLY,N +7,THERE,IS,NO,OCCASION,TO,FEEL,UNEASY,OR,DISTURBED,ABOUT,THESE,PEOPLE,S +8,USUALLY,THERE,ARE,MORE,FACTORS,TO,GOOD,SITE,PLANNING,THAN,FIRST,IMPRESSIONS,S +9,IT,WAS,A,RELIEF,TO,SHIFT,IN,HIS,MIND,TO,TECHNICAL,PROBLEMS,S +10,REE,TROPLEN,OY,SYMICILITY,OSE,MOTOLE,OGICS,INVIQUES,REE,CONGOPT,OY,VOSUTS,N +11,E,CRODGED,REE,BOOLTISELLY,FURNOINED,FIDING,ROOR,RO,REE,POMP,YORROW,PHOVE,N +12,VO,SPIEND,WOURN,UDER,SNINK,OY,ORGING,HOM,RO,FE,SUME,SNINGS,N +10,THE,ONLY,EXCEPTION,TO,THIS,IS,CERTAIN,BEES,THAT,HAVE,BECOME,PARASITES,S +11,IT,WAS,LIKE,A,LONG,THIN,LINE,DRAWN,THROUGH,A,PINK,CIRCLE,S +12,I,COULD,NOT,CLING,TO,MY,PAST,NOR,DID,I,WISH,TO,S +13,EACH,OF,THOSE,TICKETS,WAS,OF,GREAT,VALUE,TO,ITS,RIGHTFUL,RECIPIENT,S +14,I,TURNED,AND,WATCHED,HIM,STRIDE,DOWN,THE,CENTER,OF,THE,ROAD,S +15,HE,WAS,READYING,A,BATCH,OF,SOURDOUGH,BISCUITS,FOR,THE,DUTCH,OVEN,S +13,THUN,HU,NOLIGNS,FROT,REE,DRE,DOOD,OY,REE,WHOOLS,HUD,SLELLEN,N +14,REE,HOLL,WAM,OLPO,O,RECUNK,OY,OLBUST,O,DEGOLL,OY,WOFT,N +15,REE,MEP,WROW,TROR,REE,COND,HUD,WIED,OOT,OY,REE,SKO,N +16,HER,GLANCE,SWUNG,PAST,THE,TRAILER,WHERE,THE,TWO,DRIVERS,WERE,STANDING,S +17,QUESTIONS,CAME,TO,ME,FROM,ALL,SIDES,ABOUT,MY,WORLD,CITIZENSHIP,ACTIVITIES,S +18,I,INTRODUCED,MY,FRIEND,LARRY,TO,POPS,AND,WE,MADE,OURSELVES,COMFORTABLE,S +16,CHE,SWEW,NIR,NOKE,OD,O,LISTUE,OSE,OBONED,REE,COSE,FODDLE,N +17,OT,WAM,O,ROMPLE,ICEFY,FRUCISOX,WHISS,HY,SETHER,HUD,GETEN,TE,N +18,QUEY,OPURGED,O,PROGREMNS,CY,WHISS,NOUIMITANO,WAM,DITOCED,INNI,FOVE,MINTRICTS,N +19,REE,EMOKEROCKS,OD,PLIN,UTROSION,TET,O,STONNOCK,FOM,POPLEQUENT,DASSCOUTH,VETHERIVES,N +20,PLIN,OS,O,PURPIFICONT,OPHONCE,LUT,OTS,IMPORLENCE,SHOURN,VOT,NE,OTESCEROTED,N +21,EN,PLIN,WOB,HOO,WIST,NE,OSCO,RO,BELECT,EDY,ODMIEIT,MISTIVES,N +19,BUT,THE,ONLY,LOVE,I,WAS,GIVING,HIM,WAS,THE,PURE,KIND,S +20,HE,BECKONED,TO,HER,FROM,THE,DOOR,AND,SHE,SLIPPED,QUIETLY,OUTSIDE,S 
+21,BY,THIS,TIME,LARGE,NUMBERS,OF,THE,AUDIENCE,HAD,LEFT,THE,HALL,S +22,THE,GIRL,KNEELED,BY,HER,HUSBAND,WITH,ONE,ARM,AT,HIS,BACK,S +23,MIKE,CAUGHT,THE,BALL,JUST,AS,THE,CATCHER,SLID,INTO,THE,BAG,S +24,BRANNON,TIMED,IT,SO,THAT,THEY,RODE,IN,AN,HOUR,AFTER,NIGHTFALL,S +22,WHOSS,HOLILIES,ONK,SEVING,OSE,RECIGOL,FILDS,ONK,BOOLED,FOM,MORCED,OFO,N +23,REE,DINCONUING,SYLERVIMOTION,OY,BLESE,TORCES,OS,O,RESTLY,LUT,BODOSSORY,STORESS,N +24,REE,SLIN,LAN,MOFFS,SWOMELY,RO,REE,PHOVE,OSE,MIOLVES,O,NURGER,N diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set3.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set3.csv new file mode 100644 index 00000000..83ed9dfd --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set3.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,HE,ENLISTED,A,STAFF,OF,LOYAL,EXPERTS,AND,OF,MANY,ZEALOUS,VOLUNTEERS,S +2,HE,BASES,HIS,APPROACH,ON,THE,BELIEF,THAT,ANYTHING,CAN,FOLLOW,ANYTHING,S +3,IN,THIS,PLAY,THERE,WERE,SO,MANY,CHARACTERS,AND,SO,MUCH,DETAIL,S +1,LOGOY,WHOSS,GROILS,ONK,FROFTLY,CROUND,ELKBY,DUY,OSE,BUKED,INNI,CLEOD,N +2,REE,RILKS,RO,LEOMS,FUCOCIES,SHOURN,NE,PELPRONTIOLLY,REGUGNS,OSE,OBTINOSSLY,ELUSIPATED,N +3,LEENS,CAG,FOGE,OP,IMPORLENT,CLIDGE,BESHOON,REE,PRESIBOTE,OSE,REE,FOPURCY,N +4,QUAT,MOLLERED,WAM,FROT,O,NER,CONGOPT,OY,EMERINUTE,WAM,COING,BORK,N +5,CUYS,WAM,SOO,INVINGED,EN,HUS,OPE,TROPIENS,RO,POB,MUME,OTLINSION,N +6,UNENGES,OSE,GROPEGREET,ONK,SHICKED,TROR,CLERIGO,WEOFLY,TROR,OP,OMRONIC,FASK,N +4,SCOTTY,ACCEPTED,THE,DECISION,WITH,INDIFFERENCE,AND,DID,NOT,ENTER,THE,ARGUMENTS,S +5,THIS,PROCEDURE,IS,MUCH,MORE,EFFECTIVE,THAN,GIVING,OUT,A,MEMBERSHIP,PACKET,S +6,MOST,SCHOOL,SYSTEMS,TODAY,MAINTAIN,A,SYSTEM,OF,CUMULATIVE,RECORDS,OF,PUPILS,S +7,REE,WRO,PURROTATE,FOLTED,FREIR,GOYON,ONIET,KNUNTY,YLPHS,TROR,REE,WOOR,N +8,BOME,DEET,ONK,BELLER,FOM,SUME,WOFT,FRON,EDY,FOGE,OY,DRONNER,N 
+9,SOTHPROT,SOLILIR,INGUFFIPOTIONS,HABS,BOUN,MOMP,CY,PUTICOL,USCICERS,EN,UNTER,ETEOS,N +7,SOMEONE,HAD,MOVED,BEYOND,THE,RANGE,OF,THE,LIGHT,FROM,THE,WINDOW,S +8,AND,THIS,WAS,BEFORE,HE,BEGAN,TO,PLAY,HIS,STARTLINGLY,BEAUTIFUL,JAZZ,S +9,IT,APPEARED,THAT,THE,THEORY,COULD,BE,SAVED,IN,ONE,WAY,ONLY,S +10,VISS,UCIPER,OS,SETHER,OY,FOVE,CHIGSHEN,OSE,WISM,OY,O,POWRITIST,N +11,QUOTHER,OM,VOT,WANTY,LOMS,HUS,JOW,WAM,VO,CONCUCT,OY,MONE,N +12,FOM,JUMS,OP,INSTONE,HU,PROUGHT,OY,OMMODING,RO,FREM,FOM,HERF,N +10,THE,PROBLEM,OF,SOLIDARITY,AND,MORALE,AGAIN,INVOLVES,THE,CONCEPT,OF,VALUES,S +11,I,CROSSED,THE,BEAUTIFULLY,FURNISHED,LIVING,ROOM,TO,THE,PALE,YELLOW,PHONE,S +12,NO,CLIENT,WOULD,EVER,THINK,OF,ASKING,HIM,TO,DO,SUCH,THINGS,S +13,THEN,HE,NOTICED,THAT,THE,DRY,WOOD,OF,THE,WHEELS,HAD,SWOLLEN,S +14,THE,HULL,WAS,ALSO,A,RESULT,OF,ALMOST,A,DECADE,OF,WORK,S +15,THE,RED,GLOW,FROM,THE,COVE,HAD,DIED,OUT,OF,THE,SKY,S +13,PONENDS,ONK,UPTIN,CONGORNED,FROT,OMPREGONTIC,IDTHIOSSES,MAK,COIRE,REETH,RO,DEBEY,N +14,E,GOL,O,WHICK,LOAK,ET,FREIR,FOPES,AR,DE,WEWS,POMS,N +15,PIFFURES,CAG,NE,LOZEN,EN,REE,PEMLIC,ETEOS,OSE,QUEN,OD,TEERS,N +16,SHE,BLEW,HER,NOSE,ON,A,TISSUE,AND,OPENED,THE,COKE,BOTTLE,S +17,IT,WAS,A,SIMPLE,IVORY,CRUCIFIX,WHICH,MY,MOTHER,HAD,GIVEN,ME,S +18,THEY,ADOPTED,A,PROGRAM,BY,WHICH,LOUISIANA,WAS,DIVIDED,INTO,FIVE,DISTRICTS,S +16,BELLER,SHOB,OT,RO,LIOTA,OSE,MEE,EF,HU,HUD,OCESTED,OTUBLING,N +17,TRORE,WAM,REE,REDUCETION,EN,LIGET,WHISS,DE,STEVEPSED,DOD,VOT,ECITS,N +18,CY,REE,TICE,HU,HUD,SMOLED,SQUEE,LEGORMETTES,HU,HUD,COYERS,DOIL,N +19,WRO,CHIRPLY,CONWRERKING,SPOWES,DESOGGED,FOM,PEMLIC,ENHEPMENT,ONK,NOY,OD,DISTREY,N +20,WHIKE,HOULE,HOGILCOTIVE,OYERS,DERE,STULL,CONVIDOTE,REE,BIGS,WOURN,PODE,INDECT,N +21,AIRE,PUCKED,UD,REE,TIMEL,CHE,WAM,HEDDING,FOM,REE,FOBBITOL,GOULD,N +19,THE,EXUBERANCE,ON,THIS,OCCASION,SET,A,STANDARD,FOR,SUBSEQUENT,DARTMOUTH,GATHERINGS,S +20,THIS,IS,A,SIGNIFICANT,ADVANCE,BUT,ITS,IMPORTANCE,SHOULD,NOT,BE,EXAGGERATED,S +21,IN,THIS,WAY,YOU,WILL,BE,ABLE,TO,DETECT,ANY,OBVIOUS,MISTAKES,S 
+22,WHOLE,FAMILIES,ARE,MOVING,AND,REMOVAL,FIRMS,ARE,BOOKED,FOR,MONTHS,AHEAD,S +23,THE,CONTINUING,MODERNIZATION,OF,THESE,FORCES,IS,A,COSTLY,BUT,NECESSARY,PROCESS,S +24,THE,THIN,MAN,MOVED,SWIFTLY,TO,THE,PHONE,AND,DIALED,A,NUMBER,S +22,HU,TONCED,HUS,HURTLY,FOPE,LOWOND,REE,DRE,BEP,OY,REE,RODER,N +23,E,ORGED,ONIET,REE,BOMPLE,BESHOON,LIRR,OSE,WEOTH,EN,HUS,SCOYS,N +24,E,WREW,EN,FROT,MOBONT,FROT,E,DOD,VOT,HABS,EDY,CHOND,N diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set4.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set4.csv new file mode 100644 index 00000000..d648496b --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set4.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,TODAY,WHOLE,GRAINS,ARE,FRESHLY,GROUND,EVERY,DAY,AND,BAKED,INTO,BREAD,S +2,THE,RIGHT,TO,LEAVE,LEGACIES,SHOULD,BE,SUBSTANTIALLY,REDUCED,AND,ULTIMATELY,ELIMINATED,S +3,DEANS,CAN,FORM,AN,IMPORTANT,BRIDGE,BETWEEN,THE,PRESIDENT,AND,THE,FACULTY,S +1,MARTER,WEPDEGS,OSE,HUS,CLIDE,WIST,LIMS,EN,HUS,LALE,FATEST,HOULE,N +2,FLIUR,RO,FROT,HU,WAM,ENGEFIATED,PITH,LONK,ESBONT,OLEJURSITY,EN,BREEKDON,N +3,REE,ORQUINTRA,WAM,ODMIEITLY,OD,OTS,MUSTLE,OSE,OT,PLOWLS,MOGS,RETLENNINGLY,N +4,REE,SOVEAL,OSE,CRYCROCETICAL,CONSOPRASSES,OY,PLIN,CONTITOG,RO,AFFAKE,REE,ETOA,N +5,LUT,O,SOTHPROT,HORE,DETINCTS,AMILYVOS,OY,PLIN,STORESS,MAK,NE,ITTACILATING,N +6,LUT,E,UMLY,PROUGHT,OY,FROT,EN,REE,MIGGLE,OY,REE,NIFFS,N +4,WHAT,MATTERED,WAS,THAT,A,NEW,CONCEPT,OF,AMERICANS,WAS,BEING,BORN,S +5,CURT,WAS,TOO,INVOLVED,IN,HIS,OWN,PROBLEMS,TO,PAY,MUCH,ATTENTION,S +6,ORANGES,AND,GRAPEFRUIT,ARE,SHIPPED,FROM,FLORIDA,WEEKLY,FROM,AN,ORGANIC,FARM,S +7,HU,HOR,RO,TOPE,PITH,CLETTRATION,OSE,UNTER,EMECIALAL,DENCURBANCE,OSE,APOGOE,N +8,LUT,REE,INRONGOTION,OD,REE,DYPIVICS,OY,SEVUCATION,WAM,UPTIN,QUIKE,MISRIALING,N 
+9,CHE,WOURN,MITHER,LIMS,EN,PINGER,FRON,WIE,OY,HERELINESS,OSE,BORETUM,N +7,THE,TWO,LAWMEN,HALTED,THEIR,WAGON,ABOUT,TWENTY,YARDS,FROM,THE,DOOR,S +8,BARE,FEET,ARE,BETTER,FOR,SUCH,WORK,THAN,ANY,FORM,OF,SLIPPER,S +9,SOMEWHAT,SIMILAR,INVESTIGATIONS,HAVE,BEEN,MADE,BY,MEDICAL,OFFICERS,IN,OTHER,AREAS,S +10,OT,WOURN,HABS,BOUN,OOMY,RO,IMARSIFY,AR,OCIOG,CY,OTS,APAR,N +11,HU,OLPO,RECOSSED,HIMRALF,FROT,HU,HUD,OP,ORUBEAL,NURGER,OY,SYNNICILITIES,N +12,HUS,WROLL,BLONS,EAFS,DOLDED,INSAWS,REE,USIVE,OBUL,OY,HUS,FOPE,N +10,MRS,OLIVER,IS,MOTHER,OF,FIVE,CHILDREN,AND,WIFE,OF,A,MACHINIST,S +11,WHETHER,OR,NOT,WALLY,LOST,HIS,JOB,WAS,NO,CONCERN,OF,MINE,S +12,FOR,JUST,AN,INSTANT,HE,THOUGHT,OF,APPEALING,TO,THEM,FOR,HELP,S +13,PARENTS,ARE,OFTEN,CONCERNED,THAT,ORTHODONTIC,APPLIANCES,MAY,CAUSE,TEETH,TO,DECAY,S +14,I,GOT,A,QUICK,LOOK,AT,THEIR,FACES,AS,WE,WENT,PAST,S +15,PICTURES,CAN,BE,TAKEN,IN,THE,PUBLIC,AREAS,AND,WHEN,ON,TOURS,S +13,HU,RURNED,OSE,RAWLS,OTHISS,REE,PALODE,CROUND,LOWOND,REE,RECK,HOULE,N +14,SUBON,OSE,JUROE,CADE,TROR,REE,WOOR,OSE,BLAGGED,HOM,PITH,FREM,N +15,E,GRUN,ONIET,OSE,SPITTERED,SCROUGH,REE,SNONT,ROOR,RO,REE,WOOR,N +16,BETTER,SHOW,IT,TO,FIONA,AND,SEE,IF,HE,HAD,OMITTED,ANYTHING,S +17,THERE,WAS,THE,REVOLUTION,IN,TIBET,WHICH,WE,PRETENDED,DID,NOT,EXIST,S +18,BY,THE,TIME,HE,HAD,SMOKED,THREE,CIGARETTES,HE,HAD,CALMED,DOWN,S +16,REE,CLOUNS,BULCTS,DARDWARD,OSE,BURBS,PAWDENLY,INNI,O,CREOT,BLONS,TENNEL,N +17,PLIN,WIST,HERF,HOM,RO,GIT,OOT,OY,HUS,TOTTLE,LICKLE,SHON,N +18,DOIL,REE,SPEE,HU,SQUIMBLED,OSE,WRELT,ET,REE,ENCE,OY,TALIAME,N +19,HUS,YIERS,OY,CAMMESQUING,HUD,TAUNCH,HOM,REE,VOSUE,OY,GALER,DISCICRALS,N +20,HU,RECOSSED,DATSUO,OY,O,SOLILIR,THANG,HU,HUD,WIMNELVES,EN,CHIDO,N +21,EN,REE,CISTRE,OY,HUS,BRILLEINE,CUMBS,SUT,O,WROLL,BLONS,GLUSTCAP,N +19,TWO,SHARPLY,CONTRASTING,PLACES,DESIGNED,FOR,PUBLIC,ENJOYMENT,ARE,NOW,ON,DISPLAY,S +20,WHITE,HOUSE,LEGISLATIVE,AIDES,WERE,STILL,CONFIDENT,THE,BILL,WOULD,PASS,INTACT,S +21,ANNE,PICKED,UP,THE,TOWEL,SHE,WAS,HEMMING,FOR,THE,HOSPITAL,GUILD,S 
+22,HE,TILTED,HIS,HOMELY,FACE,TOWARD,THE,DRY,BED,OF,THE,RIVER,S +23,I,ASKED,ABOUT,THE,BATTLE,BETWEEN,LIFE,AND,DEATH,IN,HIS,PLAYS,S +24,I,KNEW,IN,THAT,MOMENT,THAT,I,DID,NOT,HAVE,ANY,CHOICE,S +22,HU,HUD,SHET,HUS,WOOR,PITH,REE,BRADE,NURGER,SCRENTS,RO,OT,N +23,E,FAYURED,E,COURN,STANT,PRIBLIRALLY,OTUBLING,FOM,O,COAMLE,OY,WOOKS,N +24,MAVE,SHRULLED,OD,HUS,SPOMES,ROAT,OSE,PUCKED,UD,REE,COS,KEES,N diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set5.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set5.csv new file mode 100644 index 00000000..6f32aaed --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run1_stim_set5.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,MR,WENDELL,AND,HIS,BRIDE,WILL,LIVE,IN,HIS,LAKE,FOREST,HOUSE,S +2,PRIOR,TO,THAT,HE,WAS,ASSOCIATED,WITH,LONG,ISLAND,UNIVERSITY,IN,BROOKLYN,S +3,THE,ORCHESTRA,WAS,OBVIOUSLY,ON,ITS,METTLE,AND,IT,PLAYED,MOST,RESPONSIVELY,S +1,JUMS,REE,CONEST,SUFFISTION,OY,O,HOUL,OS,FOUSE,OD,FEETOGE,PUNES,N +2,RO,REE,DICONTORS,REE,TROPLEN,OSTEORED,O,MOLLER,OY,INTROWLS,OM,DITLENONY,N +3,TRORE,WAM,TOTTLE,HOGELIHOOD,OY,EDY,CETTOWERS,GOULING,EN,ET,FROT,MOUR,N +4,HU,SLUPPED,MABING,RO,STOME,ET,HAK,PITH,HUS,POMP,DRUE,EAFS,N +5,O,NURGER,OY,DIMBEDEROTIONS,SUFFIST,FROT,PLIN,UXCURS,OODLY,EN,REE,STORESS,N +6,RO,COMPYNE,VOUR,OPWOSTED,GRODE,INGIME,HOO,TOTIT,ORS,ILEND,OY,INGIME,N +4,THE,SOCIAL,AND,PSYCHOLOGICAL,CONSEQUENCES,OF,THIS,CONTINUE,TO,AFFECT,THE,AREA,S +5,BUT,A,SOMEWHAT,MORE,DETAILED,ANALYSIS,OF,THIS,PROCESS,MAY,BE,ILLUMINATING,S +6,BUT,I,ONLY,THOUGHT,OF,THAT,IN,THE,MIDDLE,OF,THE,NIGHT,S +7,HUS,WISM,WAM,EN,DITIROTE,HEONSE,OSE,NYREING,OP,INFONE,PITH,MEESHES,N +8,HU,OTUPSED,SHIMING,EDY,MURPLISE,OM,ONSUXONCE,QUEN,VO,OVE,ORSHERED,HOM,N +9,KEETH,TORD,PUNDY,ONIET,HUS,BLEEM,RO,LEFORN,RO,EMCIO,OSE,BELMO,N 
+7,HE,HAS,TO,COPE,WITH,FRUSTRATION,AND,OTHER,EMOTIONAL,DISTURBANCE,AND,ANOMIE,S +8,BUT,THE,INFORMATION,ON,THE,DYNAMICS,OF,POPULATION,WAS,OFTEN,QUITE,MISLEADING,S +9,SHE,WOULD,RATHER,LIVE,IN,DANGER,THAN,DIE,OF,LONELINESS,AND,BOREDOM,S +10,REE,RENICKER,GEDDED,AR,HU,MOFFS,UD,BEBILL,HOM,ET,REE,BOM,N +11,EN,REE,STONLISKS,HU,COURN,MEE,REE,PREES,STRICKED,OY,FREIR,LEORED,N +12,REE,TONBET,CHOCE,DROCKLY,OSE,BRUICLY,TULLS,HOO,WHISS,OPRITIDES,FE,QUAT,N +10,IT,WOULD,HAVE,BEEN,EASY,TO,IDENTIFY,AS,OPIUM,BY,ITS,ODOR,S +11,HE,ALSO,REMINDED,HIMSELF,THAT,HE,HAD,AN,UNUSUAL,NUMBER,OF,POSSIBILITIES,S +12,HIS,SMALL,BLACK,EYES,DARTED,INSIDE,THE,OLIVE,OVAL,OF,HIS,FACE,S +13,HE,TURNED,AND,RACED,ACROSS,THE,PARADE,GROUND,TOWARD,THE,ROCK,HOUSE,S +14,SUSAN,AND,JULIA,CAME,FROM,THE,DOOR,AND,DRAGGED,HIM,WITH,THEM,S +15,I,SPUN,ABOUT,AND,CLATTERED,THROUGH,THE,FRONT,ROOM,TO,THE,DOOR,S +13,NUDER,OGICS,DOD,HU,URTER,INNI,REE,SODUOL,OY,SHIMING,REE,ONIMPMENT,N +14,THUN,ANMOCICA,RURNED,OSE,PITH,OP,OOMY,GRORT,WORVED,LOWOND,REE,LENCHEN,N +15,PORY,TIVES,SIRKS,HUS,WEOTH,FROT,SILORY,HUD,GURRIED,OSE,TROOMLED,NIR,N +16,THE,CLOUDS,BULGED,DOWNWARD,AND,BURST,SUDDENLY,INTO,A,GREAT,BLACK,FUNNEL,S +17,THIS,WILL,HELP,HIM,TO,GET,OUT,OF,HIS,LITTLE,TACKLE,SHOP,S +18,DOWN,THE,TREE,HE,SCRAMBLED,AND,KNELT,AT,THE,EDGE,OF,FOLIAGE,S +16,KUYTO,OS,REE,ONCEINE,COTITOT,OY,JOBON,OSE,STULL,OTS,CEDDUROL,CISTRE,N +17,OT,OS,VODY,MUME,O,MOLLER,OY,BOACHING,REE,FEELLOTIONS,OY,CORCORITY,N +18,REE,ILEDES,CAG,OOTIPY,NE,OLYEURS,PITH,O,HISK,DETRIE,OY,IMMOROCY,N +19,PLIN,HOMMENED,EN,REE,MIGGLE,OY,O,DRULLING,BUIT,PITH,OLUPSER,BUR,N +20,WINSHAN,TEEK,REE,CLORNEDBRUNK,OOT,OY,REE,PLUSET,OSE,WEWS,RO,WOFT,N +21,REE,LOUF,DROCKS,ONK,EN,WRO,TOGERS,OSE,ONK,VOT,RURTORED,LOGELLER,N +19,HIS,YEARS,OF,CAMPAIGNING,HAD,TAUGHT,HIM,THE,VALUE,OF,WATER,DISCIPLINE,S +20,HE,REMINDED,MATSUO,OF,A,SIMILAR,THING,HE,HAD,WITNESSED,IN,CHINA,S +21,IN,THE,CENTER,OF,HIS,BRILLIANT,CURLS,SAT,A,SMALL,BLACK,SKULLCAP,S +22,HE,HAD,SHUT,HIS,DOOR,WITH,THE,BRASS,NUMBER,SCREWED,TO,IT,S 
+23,I,FIGURED,I,COULD,STAND,PRACTICALLY,ANYTHING,FOR,A,COUPLE,OF,WEEKS,S +24,DAVE,SHRUGGED,ON,HIS,SPORTS,COAT,AND,PICKED,UP,THE,CAR,KEYS,S +22,E,WAM,HERN,UD,O,BIP,FLUING,RO,MAMP,O,LELD,RURN,N +23,HU,SHOLLED,REE,STOGS,URNER,OSE,LOGELLER,QUEY,WEWS,INNI,REE,STOREBOOM,N +24,AULER,FROT,HU,WAM,NUDER,WROWN,RO,MUN,OM,ENON,WASK,FARS,N diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set1.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set1.csv new file mode 100644 index 00000000..24c9eb24 --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set1.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,OT,BEPIME,SU,MOMOCOLOUS,AR,RO,REEM,O,POME,OY,REE,CLIOLNESS,N +2,E,WAM,GRORETUL,FOM,FREIR,INSOLLS,INNI,HY,NIED,FOM,PLIN,ENVENIESTS,N +3,PORY,OY,FREM,HABS,TRUFTED,INNI,REE,CERIES,OSE,TORMS,OSE,SEOBOLES,N +1,HE,GIVES,CREDIT,FOR,THE,PROMOTION,TO,HIS,NEW,OUTLOOK,ON,LIFE,S +2,NEED,FOR,NOVELTY,MAY,BE,A,SYMPTOM,OF,CULTURAL,FATIGUE,AND,INSTABILITY,S +3,HE,SAT,UP,AND,WATCHED,AS,THEY,PULLED,THEMSELVES,OVER,THE,STERN,S +4,HE,LEFT,THE,REST,OF,HIS,THINGS,AND,RETURNED,TO,THE,LOBBY,S +5,CHINESE,AND,INDIAN,MERCHANTS,ACROSS,THE,STREET,WERE,SLAMMING,THEIR,STEEL,SHUTTERS,S +6,A,CANDLE,ALIGHT,IN,THE,AIR,DIRECTS,ITS,FLAME,AND,SMOKE,UPWARDS,S +4,HET,UR,BRIGULOTE,O,TOTTLE,OD,REE,SUPIMUM,RIZE,OY,REE,ONOLUSSO,N +5,PLIN,DE,BERIECE,WIST,PELPRONTIOLLY,SHEEDEN,REE,SOTENTEEL,MONJET,FOM,REE,ECREYMENT,N +6,BONSORIONS,HABS,HUD,WRO,REOBENS,FOM,PERLONTING,SU,LONK,EN,FREIR,INGUFFIPOTIONS,N +7,THIS,CONFORMITY,REPRESENTS,A,DESPERATE,ATTEMPT,TO,STABILIZE,A,HOPELESSLY,UNSTABLE,ENVIRONMENT,S +8,THE,OTHER,PATRONS,WERE,TAXI,DRIVERS,AND,ART,STUDENTS,AND,SMALL,SHOPKEEPERS,S +9,MIKE,PASSED,THROUGH,IT,AND,MOVED,TOWARD,THE,DARK,MASS,OF,HORSES,S +7,NISITOST,ONK,WELMIME,RO,COTH,MEE,QUAT,BLESE,MIDIGOTER,SETHERS,CAG,FE,N 
+8,REE,FISMS,SOTURCIP,EN,EOKS,MOIFS,OS,TET,OBILL,FOM,NER,RECOLDIVES,N +9,CUDY,HOUT,EN,REE,CLOTE,FLONTERS,WIST,HERF,TEEP,UD,REE,BOMPEROTERT,N +10,AT,ONCE,A,BEVY,OF,DOGS,WAS,SNAPPING,AND,SNARLING,AROUND,HIM,S +11,AN,OBJECTIVE,SCALE,WAS,DEVELOPED,FOR,RATING,SCHOOL,NEIGHBORHOODS,FROM,THESE,DATA,S +12,WE,SPEND,MILLIONS,OF,DOLLARS,EVERY,YEAR,ON,FORTUNE,TELLERS,AND,SOOTHSAYERS,S +10,RORING,MOBONTS,OY,UNTEMNS,BRIBIS,REE,SEPHEGMITILITY,OY,MOTOCICOL,TEORERS,OS,OPERPROISING,N +11,CHE,PUCKED,UD,REE,CONY,OSE,NUFFLED,NIR,DAT,WOSK,TOTTLE,NENS,N +12,HU,JUBBED,UD,OSE,RURNED,OTOUSE,RO,MEE,REE,SOTOL,WOOR,PLUSING,N +13,TRORE,WAM,TOTTLE,CHORKS,ARYUMS,WOURN,URTER,PLIN,CHOFT,RORING,REE,GONTER,N +14,E,HUD,HELT,REE,DRORD,QUEY,DERE,ROXING,WHIMP,SOUSSING,REE,STOITS,N +15,HU,SELVED,MITHER,FRON,HEORN,REE,GOIR,FROT,SWECS,OTHISS,REE,ORIENCE,N +13,HE,SAT,DOWN,ON,AN,OLD,BOX,AND,FOCUSED,ON,THE,PROBLEM,S +14,THIS,IS,AN,ASSUMPTION,WITH,WHICH,FEW,WOULD,BE,DISPOSED,TO,QUARREL,S +15,IT,WAS,A,ROUGH,LONG,RIDE,THROUGH,THE,MUD,AND,POT,HOLES,S +16,QUEY,POFFED,PONCHES,FROT,DERE,FROPED,DORD,CREY,OGIMMS,REE,BLONS,HOLLS,N +17,WHIKE,RUPPER,OS,REE,CROUND,SEOD,OY,REE,COGGON,BLONS,RUPPER,FREOT,N +18,FINNA,CLIONLY,GOL,O,CROOM,OSE,PRORTED,RO,PREEP,UD,REE,SOCOR,N +16,COMPUTERS,ARE,BEING,USED,TO,KEEP,BRANCH,INVENTORIES,AT,MORE,WORKABLE,LEVELS,S +17,THERE,ARE,THOUSANDS,OF,SQUARE,MILES,OF,SALT,PAN,WHICH,ARE,HIDEOUS,S +18,I,WENT,TO,VISIT,ALFRED,IN,THE,KINGSTON,HOSPITAL,A,FEW,TIMES,S +19,HE,SEEMED,TO,BE,LOOKING,AT,A,POINT,ABOVE,THE,LITTLE,WINDOW,S +20,ALMOST,NO,EMPIRICAL,WORK,HAS,BEEN,DONE,ON,THE,PROBLEM,OF,ALIENATION,S +21,IF,WE,LOOK,AT,RECENT,ART,WE,FIND,IT,PREOCCUPIED,WITH,FORM,S +19,OD,REE,WOB,HU,SLUPPED,ET,REE,DELK,RO,REMIEVE,HUS,MOGE,N +20,PLIN,WAM,WONE,PITH,FUMS,KNORNEPTS,FROT,TRORE,WOURN,NE,VO,EDIGOGIC,N +21,AMSHED,WORVED,POMS,HOM,GOTHOUT,O,WOIL,OSE,GOL,INNI,REE,COS,N +22,OT,WAM,REE,NIFFS,DRAGTON,HUD,TROCKED,FREM,EN,REE,ROKER,GODE,N +23,CHE,WAM,FOUSE,REE,DUY,AULER,ET,REE,BOTTOP,OY,REE,FLITT,N 
+24,WIME,GONDOWS,PITH,PORY,WROLL,TEORED,POKED,SWECS,OTHISS,REE,USSER,PROSIES,N +22,YOU,COULD,WIN,A,POPULARITY,CONTEST,AT,THAT,SCHOOL,WITHOUT,ANY,TROUBLE,S +23,SUSAN,AND,JULIA,RIPPED,STRIPS,FROM,THEIR,CLOTHING,AND,BOUND,THE,INJURY,S +24,WE,HAD,BECOME,GOOD,FRIENDS,DURING,MY,STAY,AT,COOK,COUNTY,HOSPITAL,S diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set2.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set2.csv new file mode 100644 index 00000000..c88bfebf --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set2.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,REPPIES,OY,PLIN,TUTTER,DERE,MOMP,IMICTOBLE,RO,REE,PRELT,OSE,PEMLIC,N +2,NER,UCUOS,FOM,IMBRIGING,TIWRITION,CADE,PITH,REE,CHOVY,OY,SORD,CHOOTMENT,N +3,QUEY,ONK,FLUING,RO,DIDORSTROTE,SOTH,DETLERENT,WOSS,OY,FOUCHING,OSE,FOURNING,N +1,IT,BECAME,SO,MONOTONOUS,AS,TO,SEEM,A,PART,OF,THE,QUIETNESS,S +2,I,WAS,GRATEFUL,FOR,THEIR,INSIGHT,INTO,MY,NEED,FOR,THIS,EXPERIENCE,S +3,MANY,OF,THEM,HAVE,DRIFTED,INTO,THE,CITIES,AND,TOWNS,AND,SEAPORTS,S +4,LET,US,SPECULATE,A,LITTLE,ON,THE,MAXIMUM,SIZE,OF,THE,ANACONDA,S +5,THIS,WE,BELIEVE,WILL,SUBSTANTIALLY,BROADEN,THE,POTENTIAL,MARKET,FOR,THE,EQUIPMENT,S +6,HISTORIANS,HAVE,HAD,TWO,REASONS,FOR,PERSISTING,SO,LONG,IN,THEIR,INVESTIGATIONS,S +4,E,GORTED,RO,HERF,SU,FROT,DE,COURN,FIMP,TICE,RO,SCOY,N +5,QUEY,ONK,REE,MOGS,BOOLTISUL,POME,OY,FROT,TOTTLE,PIEKS,OY,LOTURE,N +6,HU,GORTED,RO,GE,BAPS,RO,HANHARD,FOM,OLUPSER,YLPH,OY,SCAYPHIDING,N +7,VISITORS,ARE,WELCOME,TO,COME,SEE,WHAT,THESE,DEDICATED,MOTHERS,CAN,DO,S +8,THE,FIRST,SATURDAY,IN,EACH,MONTH,IS,SET,ASIDE,FOR,NEW,RECORDINGS,S +9,BODY,HEAT,IN,THE,CLOSE,QUARTERS,WILL,HELP,KEEP,UP,THE,TEMPERATURE,S +7,AULER,FROT,QUEY,HUD,SUT,FOM,FOVE,SONUTES,GOTHOUT,RABING,O,WOIL,N +8,MOGS,OY,UR,WOURN,NE,WORTING,RO,ODCOT,FROT,FORSOCKNESS,CONCE,HOWN,N 
+9,VO,OVE,SUFFISTED,FROT,REE,ENSIFOL,EFFALLS,OY,REE,ADE,DERE,OUTULEVONT,N +10,DURING,MOMENTS,OF,INTENSE,CRISIS,THE,RESPONSIBILITY,OF,POLITICAL,LEADERS,IS,OVERWHELMING,S +11,SHE,PICKED,UP,THE,BABY,AND,NUZZLED,HER,FAT,WARM,LITTLE,NECK,S +12,HE,JUMPED,UP,AND,TURNED,AROUND,TO,SEE,THE,METAL,DOOR,CLOSING,S +10,REE,OTMOKNORE,OS,FROT,OY,OP,OMSTOCTIVE,PRIVINE,BEOLS,CLIB,ET,HOTH,N +11,CHE,HUD,ONDIVED,PLIN,SERNING,OSE,COTH,STROCHES,RO,REE,ELTKISH,WORPENS,N +12,QUEN,HU,GOTS,REE,ORSHERS,RO,HUS,SNUSTIONS,HU,WIST,NE,DENCEOROGED,N +13,DE,MOMP,OT,BAPS,RO,REE,HONBEOR,EN,LELT,FRON,FEIR,SONUTES,N +14,TRORE,OS,O,FUNCIBLE,TEEKING,EN,REE,ORF,OY,RELORSION,LOWOND,MELITICS,N +15,FREIR,RURIES,INSPODE,ELUSEITION,OY,REE,INRONGOTION,COTTOCTED,OSE,PRODENOTION,OY,RETORCORDOTIONS,N +13,THERE,WAS,LITTLE,CHANCE,ANYONE,WOULD,ENTER,THIS,SHAFT,DURING,THE,WINTER,S +14,I,HAD,FELT,THE,DRAFT,THEY,WERE,MAKING,WHILE,MOUNTING,THE,STAIRS,S +15,HE,SENSED,RATHER,THAN,HEARD,THE,GASP,THAT,SWEPT,ACROSS,THE,AUDIENCE,S +16,GOTHIN,WRO,WOOKS,WONGEN,WAM,ROSSING,REE,BEMS,ET,REE,OGGEY,GOKE,N +17,HU,BERIEW,EN,ROXING,ENWRORING,CLOUSSES,OSE,HU,MOMP,O,CREOT,PORY,N +18,DE,BOUSED,HUS,REGONDS,OSE,PLOWLS,FREM,EN,OOR,SCHOATS,OSE,AMILERSITIES,N +16,THEY,PASSED,RANCHES,THAT,WERE,FRAMED,DARK,GRAY,AGAINST,THE,BLACK,HILLS,S +17,WHITE,PEPPER,IS,THE,GROUND,SEED,OF,THE,COMMON,BLACK,PEPPER,FRUIT,S +18,LINDA,QUIETLY,GOT,A,BROOM,AND,STARTED,TO,SWEEP,UP,THE,SUGAR,S +19,ON,THE,WAY,HE,STOPPED,AT,THE,DESK,TO,RECEIVE,HIS,MAIL,S +20,THIS,WAS,DONE,WITH,FULL,KNOWLEDGE,THAT,THERE,WOULD,BE,NO,EPIDEMIC,S +21,ALFRED,WALKED,PAST,HIM,WITHOUT,A,WORD,AND,GOT,INTO,THE,CAR,S +19,REE,WILLSTIELD,WROSS,OS,FLOTTERFLOOK,OSE,CLEXIDROSS,OS,URTS,EN,REE,BOGIN,N +20,STUBONT,TEORERS,BENON,CLOCUDIC,EFFONCE,RO,BEFURIOTE,CLEODRE,INCOPRITION,PITEROL,MORCED,ADA,N +21,TOOD,SOSIO,DROLI,HOR,OTS,OPE,SPEVEOL,DENENDS,FROT,COYLY,NIED,REIMPEDURANTER,N +22,REE,POGOR,OS,FONKING,OT,OMEWOCK,RO,COMPOTHS,OGIMMS,HUS,OPE,REGOND,N 
+23,NIR,SETHER,OLPO,WAM,O,PITSON,OY,MIMORIOR,MOND,OSE,CLOOD,UNCEREVES,N +24,REE,UNTERS,LIPETISE,HUD,HIRMEN,THULPELVES,EN,REE,GROMP,OSE,REE,BRUBS,N +22,IT,WAS,THE,NIGHT,CLAYTON,HAD,TRICKED,THEM,IN,THE,POKER,GAME,S +23,SHE,WAS,FOUND,THE,DAY,AFTER,AT,THE,BOTTOM,OF,THE,CLIFF,S +24,WIDE,WINDOWS,WITH,MANY,SMALL,LEADED,PANES,SWEPT,ACROSS,THE,UPPER,STORIES,S diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set3.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set3.csv new file mode 100644 index 00000000..2f956c55 --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set3.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,SOTH,OY,REE,ESBONT,OS,SIND,OSE,OS,VOT,POODOBLE,FOM,FIDING,N +2,KNORNEPTS,GOWLED,TROR,CHOVYING,EERSHQUORT,WORED,HOR,BOUN,ODSTIED,EN,NONIOUS,FIECED,N +3,E,SNINK,HOO,ONK,COING,UNFOBS,RO,TADE,BLESE,SNINGS,UD,NOY,N +1,COPIES,OF,THIS,LETTER,WERE,MADE,AVALIABLE,TO,THE,PRESS,AND,PUBLIC,S +2,NEW,IDEAS,FOR,IMPROVING,NUTRITION,CAME,WITH,THE,STUDY,OF,SOIL,TREATMENT,S +3,THEY,ARE,TRYING,TO,DEMONSTRATE,SOME,DIFFERENT,WAYS,OF,TEACHING,AND,LEARNING,S +4,I,WANTED,TO,HELP,SO,THAT,WE,COULD,FIND,TIME,TO,PLAY,S +5,THEY,ARE,THE,MOST,BEAUTIFUL,PART,OF,THAT,LITTLE,PIECE,OF,NATURE,S +6,HE,WANTED,TO,GO,BACK,TO,HARVARD,FOR,ANOTHER,YEAR,OF,PLAYWRITING,S +4,REE,IMPORLENCE,OY,MORTIGRO,RIZE,EN,SUME,OOROMILS,HOR,BOUN,CLOROUGHLY,DIDORSTROTED,N +5,HU,CONCOLERED,OCISING,O,CAG,OY,BOUR,LUT,VEROUD,FROT,UCUO,SOO,N +6,NOVE,OY,UR,WAM,OFURE,FROT,REE,BIBBEST,FIFFS,WAM,STULL,OFO,N +7,AFTER,THAT,THEY,HAD,SAT,FOR,FIVE,MINUTES,WITHOUT,SAYING,A,WORD,S +8,MOST,OF,US,WOULD,BE,WILLING,TO,ADMIT,THAT,FORGIVENESS,COMES,HARD,S +9,NO,ONE,SUGGESTED,THAT,THE,ETHICAL,EFFECTS,OF,THE,ART,WERE,IRRELEVANT,S +7,E,WAM,BORSTANTLY,MEORTHING,FOM,CLUSK,OTOUSE,REE,NEOCHNOURPOOD,OY,REE,HORS,N 
+8,MONGHECERY,WREW,ORS,REE,VOLIENOL,TEORERS,UD,RO,REE,TICE,OY,KANGADY,N +9,REE,SOLCHOSERS,REHONTED,REE,URPER,EN,WRO,TUTTERS,PROTTEN,EN,NEPOROUS,TELDS,N +10,THE,ATMOSPHERE,IS,THAT,OF,AN,ATTRACTIVE,PRIVATE,BEACH,CLUB,AT,HOME,S +11,SHE,HAD,ARRIVED,THIS,MORNING,AND,COME,STRAIGHT,TO,THE,ENGLISH,GARDENS,S +12,WHEN,HE,GETS,THE,ANSWERS,TO,HIS,QUESTIONS,HE,WILL,BE,DISCOURAGED,S +10,REE,FLIOT,HUD,PABBED,REE,LOLKS,COORDLOOM,FOM,HORE,FRON,O,WOOK,N +11,BENGAN,GRINGED,OSE,SPIPPED,O,RECK,PITH,HUS,THURL,LOKE,O,MONDLE,N +12,TRORE,WAM,SOILCLING,MOCHES,OSE,BROKY,ONIET,OTS,MOROON,FROT,DISTOLVES,FREM,N +13,HU,SPINT,LONK,HEERS,LEFOSE,REE,PEREDISION,PLUTTING,OOT,PRERIDES,OY,RESONGE,N +14,HU,LOAKED,FOM,REE,COURCE,OY,REE,NOITS,FROT,HUD,OFUKENED,HOM,N +15,PLIN,EXCETSION,HOR,BOUN,WIING,OD,FOM,OP,OUTIKOTER,EILED,CULLION,YIERS,N +13,WE,MADE,IT,BACK,TO,THE,HARBOR,IN,LESS,THAN,FOUR,MINUTES,S +14,THERE,IS,A,TANGIBLE,FEELING,IN,THE,AIR,OF,REVULSION,TOWARD,POLITICS,S +15,THEIR,DUTIES,INCLUDE,EVALUATION,OF,THE,INFORMATION,COLLECTED,AND,PREPARATION,OF,RECOMMENDATIONS,S +16,NIR,FOPE,REEMED,RO,FLOOT,EN,OP,INSCABDIBLY,CLIGHT,CHOFT,OY,PONLIGHT,N +17,REE,RIZE,OY,WEOREN,MOURD,WIST,DETERBOLS,REE,OSIENT,OY,CROY,NOONED,N +18,E,TEEK,REE,PAPE,OSE,CHEVEL,OSE,WEWS,OOT,OD,REE,POLKS,N +16,WITHIN,TWO,WEEKS,WARREN,WAS,RINGING,THE,BELL,AT,THE,ABBEY,GATE,S +17,HE,BELIEVED,IN,MAKING,INSPIRING,SPEECHES,AND,HE,MADE,A,GREAT,MANY,S +18,WE,BOUGHT,HIS,RECORDS,AND,PLAYED,THEM,IN,OUR,SCHOOLS,AND,UNIVERSITIES,S +19,THE,WINDSHIELD,GLASS,IS,SHATTERPROOF,AND,PLEXIGLAS,IS,USED,IN,THE,CABIN,S +20,STUDENT,LEADERS,BEGAN,SPORADIC,EFFORTS,TO,NEGOTIATE,THEATER,INTEGRATION,SEVERAL,MONTHS,AGO,S +21,GOOD,RADIO,DRAMA,HAS,ITS,OWN,SPECIAL,DEMANDS,THAT,BADLY,NEED,REINVIGORATION,S +19,WOM,OS,REE,RECUNK,OY,MISSLIST,OSE,LONS,OY,OMPERPRONDING,BESHOON,DOOPLE,N +20,BRINSTON,RETIFONTS,HABS,BOUN,NOREROUS,CONSCIKETORS,RO,REE,FUKE,IBER,REE,YIERS,N +21,GRENCELCA,OSE,HERSORT,DERE,OBUNG,REE,WEW,DOOPLE,DE,WREW,EN,DORECONIO,N 
+22,PECTOR,APAL,WAM,TUZZY,UD,SNONT,PITH,SOTH,OY,HUS,LIMS,PORIENDS,N +23,SOTH,DOOPLE,CAG,COSMS,OLBUST,OTUBLING,OOT,OY,O,PIEKS,OY,DOOD,N +24,HU,DEES,VOT,REEM,RO,HABS,CAINTS,REE,SURQUENIES,OY,REE,LAN,N +22,THE,MAYOR,IS,FINDING,IT,AWKWARD,TO,CAMPAIGN,AGAINST,HIS,OWN,RECORD,S +23,HER,MOTHER,ALSO,WAS,A,PERSON,OF,SUPERIOR,MIND,AND,BROAD,INTERESTS,S +24,THE,OTHERS,LIKEWISE,HAD,HIDDEN,THEMSELVES,IN,THE,GRASS,AND,THE,BRUSH,S diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set4.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set4.csv new file mode 100644 index 00000000..b2b826c7 --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set4.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,QUEY,ONK,PIMNING,STROSH,CHESTURE,OD,FREIR,POLYLL,DENIMPMENTS,RO,TEEP,URPER,N +2,SANANAR,HOGURT,HULFSROY,OS,ODMIEITLY,O,LAN,PITH,O,ROUL,OSE,HEALT,N +3,E,FILLAWED,FREM,EN,REE,JOUP,OSE,NOY,QUEY,DOD,VOT,CANK,N +1,SOME,OF,THE,ISLAND,IS,SAND,AND,IS,NOT,SUITABLE,FOR,LIVING,S +2,KNOWLEDGE,GAINED,FROM,STUDYING,EARTHQUAKE,WAVES,HAS,BEEN,APPLIED,IN,VARIOUS,FIELDS,S +3,I,THINK,YOU,ARE,BEING,UNFAIR,TO,TAKE,THESE,THINGS,UP,NOW,S +4,THE,IMPORTANCE,OF,PARTICLE,SIZE,IN,SUCH,AEROSOLS,HAS,BEEN,THOROUGHLY,DEMONSTRATED,S +5,HE,CONSIDERED,OPENING,A,CAN,OF,BEER,BUT,VETOED,THAT,IDEA,TOO,S +6,NONE,OF,US,WAS,AWARE,THAT,THE,BIGGEST,FIGHT,WAS,STILL,AHEAD,S +4,SOMEMEMES,E,GUNESCO,OT,WAM,BEBAVES,REE,TAIN,SQUARS,HUD,CHAMMED,DICONTION,N +5,PLIN,MOPEGAAL,STUOCIGNES,URMER,USTRAPIOMUM,LIFFS,WHISS,BOMINITATES,OTS,MIMPLING,OSE,ASCISHMENT,N +6,HU,OS,BELLER,TOTTED,RO,PERFAID,HUS,SOVEAL,LIRR,OBUNG,HUS,FULLORN,N +7,I,WAS,CONSTANTLY,SEARCHING,FOR,CLUES,AROUND,THE,NEIGHBORHOOD,OF,THE,HALL,S +8,MONTGOMERY,KNEW,ALL,THE,NATIONAL,LEADERS,UP,TO,THE,TIME,OF,KENNEDY,S +9,THE,PURCHASERS,REJECTED,THE,ORDER,IN,TWO,LETTERS,WRITTEN,IN,VIGOROUS,TERMS,S 
+7,TROR,REE,OUTNODE,OT,WAM,OP,ODMITORY,ENEAUX,HOULE,OY,REE,GISTRY,N +8,CHE,ENVENIELLED,NOVE,OY,REE,SUSLANSE,OY,SOTH,POOM,STRINKER,MOLLING,ANCYGRACOKIAS,N +9,QUOTHER,HU,SABS,WOLL,OM,COYLY,HUD,NENTING,RO,FE,PITH,OT,N +10,THE,TRIAL,HAD,PACKED,THE,LARGE,COURTROOM,FOR,MORE,THAN,A,WEEK,S +11,BENSON,GRINNED,AND,FLIPPED,A,ROCK,WITH,HIS,THUMB,LIKE,A,MARBLE,S +12,THERE,WAS,SOMETHING,MAIMED,AND,CRAZY,ABOUT,ITS,MOTION,THAT,DISTURBED,THEM,S +10,HU,WEWS,RO,REE,SNONT,WOOR,OSE,OBONED,OT,OSE,LOAKED,EN,N +11,HU,CAINTS,NIR,CY,OP,ORL,OSE,HELVED,NIR,INNI,REE,LENCHEN,N +12,HU,TORD,HIMRALF,HU,HUD,NUDER,SEAN,WRO,DOOPLE,EET,SU,MUME,N +13,MIMP,SNOTCHED,O,PUNTOL,TROR,REE,HEAN,OY,FLITTERED,BOURY,OSE,DIRED,N +14,RUD,GABS,NIR,O,WOSK,DIT,OD,REE,SHEALPER,LEFOSE,HU,RETWEED,N +15,HU,GOL,O,WROLL,FIMS,PRORTED,OSE,DUT,OD,BABIN,OSE,COCTEE,N +13,HE,SPENT,LONG,HOURS,BEFORE,THE,TV,SPITTING,OUT,PROMISES,OF,REVENGE,S +14,HE,LOOKED,FOR,THE,SOURCE,OF,THE,NOISE,THAT,HAD,AWAKENED,HIM,S +15,THIS,EXPANSION,HAS,BEEN,GOING,ON,FOR,AN,ESTIMATED,EIGHT,BILLION,YEARS,S +16,REMOY,SAR,PONLIGHT,TEECH,REE,BIRLY,BLOIFS,HAITS,OD,REE,BRORN,SMIN,N +17,REE,MARENT,COMBANCED,RO,WOOP,OSE,OT,BLATCHED,REE,SELPS,OY,ENHEPMENT,N +18,THUN,HU,ASSINIGNED,DATSUO,CY,PESTING,OSE,CLIGGING,HIMRALF,UNTAT,HU,SUT,N +16,HER,FACE,SEEMED,TO,FLOAT,IN,AN,IMPLAUSIBLY,BRIGHT,SHAFT,OF,SUNLIGHT,S +17,THE,SIZE,OF,WOODEN,MOLD,WILL,DETERMINE,THE,AMOUNT,OF,CLAY,NEEDED,S +18,I,TOOK,THE,PAIL,AND,SHOVEL,AND,WENT,OUT,ON,THE,PORCH,S +19,WAR,IS,THE,RESULT,OF,MISTRUST,AND,LACK,OF,UNDERSTANDING,BETWEEN,PEOPLE,S +20,CRANSTON,RESIDENTS,HAVE,BEEN,GENEROUS,CONTRIBUTORS,TO,THE,FUND,OVER,THE,YEARS,S +21,FRANCESCA,AND,HERBERT,WERE,AMONG,THE,FEW,PEOPLE,WE,KNEW,IN,CATALONIA,S +19,E,SAR,HOM,MYBOLF,OSE,OT,WAM,WONE,AULER,CONCATLATION,PITH,CRIMWOLL,N +20,RO,HABS,SORNNAY,FROT,LOND,LEFORNED,WAM,QUAT,HU,HUD,LINCE,FOM,N +21,LUT,OT,WAM,SUME,O,NILT,PROUGHT,FROT,HU,GEDDED,HUS,HEAM,N +22,ADYTHE,MUTTLED,DOIL,RO,BEMIME,O,SOVEAL,MYME,OSE,O,FERMIBLE,EXAYBRA,N 
+23,AIRE,WILDED,UNTAT,REE,WOOR,HUD,SLANNED,OSE,PUCKED,UD,REE,COCTIEPOT,N +24,REE,EFFAKE,OD,REE,INTESCELLEALS,OBUNG,HUS,ORIENCE,MAK,WOLL,NE,IRAGENED,N +22,DOC,ABEL,WAS,BUSY,UP,FRONT,WITH,SOME,OF,HIS,LIVE,PATIENTS,S +23,SOME,PEOPLE,CAN,CARVE,ALMOST,ANYTHING,OUT,OF,A,PIECE,OF,WOOD,S +24,HE,DOES,NOT,SEEM,TO,HAVE,CAUGHT,THE,SUBTLETIES,OF,THE,MAN,S diff --git a/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set5.csv b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set5.csv new file mode 100644 index 00000000..165bebc8 --- /dev/null +++ b/brainscore_language/data/fedorenko2010_localization/langloc_fmri_run2_stim_set5.csv @@ -0,0 +1,49 @@ +stim1,stim2,stim3,stim4,stim5,stim6,stim7,stim8,stim9,stim10,stim11,stim12,stim13,stim14 +1,HU,GOVES,BRUDIT,FOM,REE,STOCOTION,RO,HUS,NER,OUTLEOF,OD,LIRR,N +2,NIED,FOM,NEVESRY,MAK,NE,O,SYMELOM,OY,CEDDUROL,FOTOLMS,OSE,INCRECORITY,N +3,HU,SUT,UD,OSE,WONCHED,AR,QUEY,PUBBED,THULPELVES,IBER,REE,STURN,N +1,THEY,ARE,PUTTING,STRONG,PRESSURE,ON,THEIR,POLICE,DEPARTMENTS,TO,KEEP,ORDER,S +2,SENATOR,HUBERT,HUMPHREY,IS,OBVIOUSLY,A,MAN,WITH,A,SOUL,AND,HEART,S +3,I,FOLLOWED,THEM,IN,THE,JEEP,AND,NOW,THEY,DID,NOT,CARE,S +4,SOMETIMES,I,GUESSED,IT,WAS,BECAUSE,THE,RAIN,SQUALL,HAD,CHANGED,DIRECTION,S +5,THIS,MATERIAL,FLUORESCES,UNDER,ULTRAVIOLET,LIGHT,WHICH,FACILITATES,ITS,SAMPLING,AND,ASSESSMENT,S +6,HE,IS,BETTER,FITTED,TO,PERFORM,HIS,SOCIAL,LIFE,AMONG,HIS,FELLOWS,S +4,HU,LELD,REE,RERE,OY,HUS,SNINGS,OSE,LEFORNED,RO,REE,LOMPY,N +5,SHONESE,OSE,ENCIOD,MERSHOCKS,OTHISS,REE,STREOP,DERE,SLOSSING,FREIR,STOUL,CHOTTERS,N +6,O,CORGLE,OTIVES,EN,REE,ORF,DIRIDES,OTS,FLOPE,OSE,SMOLE,UGWONDS,N +7,FROM,THE,OUTSIDE,IT,WAS,AN,ORDINARY,ENOUGH,HOUSE,OF,THE,GENTRY,S +8,SHE,EXPERIENCED,NONE,OF,THE,SUSPENSE,OF,SOME,POOR,STRANGER,SELLING,ENCYCLOPEDIAS,S +9,WHETHER,HE,SANG,WELL,OR,BADLY,HAD,NOTHING,TO,DO,WITH,IT,S +7,PLIN,DINMIRMITY,REDNEFENTS,O,DITCEROTE,OLLEMPT,RO,STOFILOLS,O,TORMFESSLY,URCHADLE,ERCIRULMENT,N 
+8,REE,UNTER,POGRENS,DERE,MOXI,CLIRERS,OSE,ADE,STUBONTS,OSE,WROLL,FRILLKOOPERS,N +9,MIMP,POFFED,SCROUGH,OT,OSE,MOFFS,LOWOND,REE,DORD,MEMP,OY,HURTES,N +10,HE,WENT,TO,THE,FRONT,DOOR,AND,OPENED,IT,AND,LOOKED,IN,S +11,HE,CAUGHT,HER,BY,AN,ARM,AND,HELPED,HER,INTO,THE,KITCHEN,S +12,HE,TOLD,HIMSELF,HE,HAD,NEVER,SEEN,TWO,PEOPLE,EAT,SO,MUCH,S +10,ET,ORKS,O,BUPY,OY,DEGS,WAM,SWIPPING,OSE,SNURVING,OTOUSE,HOM,N +11,OP,OTFOCTIVE,SCOSS,WAM,DEVESYSMS,FOM,RUMING,SCHOAT,NEOCHNOURPOODS,TROR,BLESE,ROLO,N +12,DE,SPIND,MILLOOTS,OY,PELLORS,ELKBY,YLPH,OD,FORBONE,TUNKERS,OSE,SOOSTGOYERS,N +13,HU,SUT,DOIL,OD,OP,ORN,BOF,OSE,FOCECKED,OD,REE,TROPLEN,N +14,PLIN,OS,OP,OTTURPTION,PITH,WHISS,WEW,WOURN,NE,DISLODES,RO,SWORRUL,N +15,OT,WAM,O,MOUGH,LONK,RIMP,SCROUGH,REE,RUD,OSE,MOT,HOWED,N +13,MIKE,SNATCHED,A,PISTOL,FROM,THE,HEAP,OF,SCATTERED,BOOTY,AND,FIRED,S +14,ROD,GAVE,HER,A,WARM,PAT,ON,THE,SHOULDER,BEFORE,HE,REPLIED,S +15,HE,GOT,A,SMALL,FIRE,STARTED,AND,PUT,ON,BACON,AND,COFFEE,S +16,CORMUNERS,ONK,COING,URTS,RO,TEEP,BROFFS,INMINCONIES,ET,HORE,NUNKOBLE,TUVELS,N +17,TRORE,ONK,CLAUPANDS,OY,SQUONK,MIPES,OY,SOFF,POS,WHISS,ONK,FUTEOUS,N +18,E,WEWS,RO,JISIL,AMSHED,EN,REE,KINGSTON,FOBBITOL,O,WEW,TIVES,N +16,RAMEY,SAW,SUNLIGHT,TOUCH,THE,CURLY,BLONDE,HAIRS,ON,THE,BROWN,SKIN,S +17,THE,MARINE,COMMENCED,TO,WEEP,AND,IT,BLIGHTED,THE,SENSE,OF,ENJOYMENT,S +18,THEN,HE,ASTONISHED,MATSUO,BY,PUSHING,AND,DRAGGING,HIMSELF,UNTIL,HE,SAT,S +19,I,SAW,HIM,MYSELF,AND,IT,WAS,DONE,AFTER,CONSULTATION,WITH,CROMWELL,S +20,TO,HAVE,SOMEDAY,THAT,LOVE,RETURNED,WAS,WHAT,HE,HAD,LIVED,FOR,S +21,BUT,IT,WAS,SUCH,A,NICE,THOUGHT,THAT,HE,NODDED,HIS,HEAD,S +19,HU,REEMED,RO,NE,LOAKING,ET,O,POITS,OPOKE,REE,TOTTLE,GONDOW,N +20,OLBUST,VO,ELLYRINOL,WOFT,HOR,BOUN,WONE,OD,REE,TROPLEN,OY,IMUILOTION,N +21,EF,DE,LOAK,ET,RECURE,ADE,DE,FIMP,OT,PREOTBUVOED,PITH,FOGE,N +22,HOO,COURN,WIP,O,SYLUCOLITY,CORTETS,ET,FROT,SCHOAT,GOTHOUT,EDY,TROOMLE,N +23,SUBON,OSE,JUROE,MIPPED,STREPS,TROR,FREIR,FRITHING,OSE,BOUSE,REE,ENCUFY,N 
+24,DE,HUD,BEMIME,TOOD,FRIESTS,RORING,HY,STOK,ET,COOB,COURRY,FOBBITOL,N +22,EDYTHE,SETTLED,DOWN,TO,BECOME,A,SOCIAL,MYTH,AND,A,HORRIBLE,EXAMPLE,S +23,ANNE,WAITED,UNTIL,THE,DOOR,HAD,SLAMMED,AND,PICKED,UP,THE,COFFEEPOT,S +24,THE,EFFECT,ON,THE,INTELLECTUALS,AMONG,HIS,AUDIENCE,MAY,WELL,BE,IMAGINED,S diff --git a/brainscore_language/model_helpers/huggingface.py b/brainscore_language/model_helpers/huggingface.py index 84576853..96618f4e 100644 --- a/brainscore_language/model_helpers/huggingface.py +++ b/brainscore_language/model_helpers/huggingface.py @@ -17,6 +17,7 @@ from brainscore_language.artificial_subject import ArtificialSubject from brainscore_language.model_helpers.preprocessing import prepare_context from brainscore_language.utils import fullname +from brainscore_language.model_helpers.localize import localize_fed10 class HuggingfaceSubject(ArtificialSubject): @@ -26,6 +27,8 @@ def __init__( region_layer_mapping: dict, model=None, tokenizer=None, + use_localizer=False, + localizer_kwargs=None, task_heads: Union[None, Dict[ArtificialSubject.Task, Callable]] = None, ): """ @@ -41,6 +44,7 @@ def __init__( """ self._logger = logging.getLogger(fullname(self)) self.model_id = model_id + self.use_localizer = use_localizer self.region_layer_mapping = region_layer_mapping self.basemodel = (model if model is not None else AutoModelForCausalLM.from_pretrained(self.model_id)) self.device = 'cuda' if torch.cuda.is_available() else 'cpu' @@ -59,6 +63,18 @@ def __init__( } self.task_function_mapping_dict = {**task_mapping_default, **task_heads} if task_heads else task_mapping_default + if self.use_localizer: + layer_names = region_layer_mapping["language_system"] + self.language_mask = localize_fed10(model_id=self.model_id, + model=self.basemodel, + tokenizer=self.tokenizer, + layer_names=layer_names, + top_k=localizer_kwargs["top_k"], + batch_size=localizer_kwargs["batch_size"], + hidden_dim=localizer_kwargs["hidden_dim"], + device=self.device + ).flatten() + def 
identifier(self): return self.model_id @@ -122,6 +138,13 @@ def digest_text(self, text: Union[str, List[str]]) -> Dict[str, DataAssembly]: if output['behavior'] else None output['neural'] = xr.concat(output['neural'], dim='presentation').sortby('part_number') \ if output['neural'] else None + + if self.neural_recordings and self.use_localizer: + num_presentations = output['neural'].data.shape[0] + output['neural-mask'] = output['neural'].copy() + output['neural-mask'].data = np.repeat(self.language_mask[np.newaxis,:], num_presentations, axis=0) + output['neural'] = output['neural'].where(output['neural-mask'], drop=True) + return output def _prepare_context(self, context_parts): @@ -190,11 +213,16 @@ def _setup_hooks(self): hooks = [] layer_representations = OrderedDict() for (recording_target, recording_type) in self.neural_recordings: - layer_name = self.region_layer_mapping[recording_target] - layer = self._get_layer(layer_name) - hook = self._register_hook(layer, key=(recording_target, recording_type, layer_name), - target_dict=layer_representations) - hooks.append(hook) + layer_names = self.region_layer_mapping[recording_target] + if type(layer_names) == str: + layer_names = [layer_names] + + for layer_idx, layer_name in enumerate(layer_names): + layer = self._get_layer(layer_name) + hook = self._register_hook(layer, key=(f"{recording_target}.{layer_idx}", recording_type, layer_name), + target_dict=layer_representations) + hooks.append(hook) + return hooks, layer_representations def output_to_representations(self, layer_representations: Dict[Tuple[str, str, str], np.ndarray], stimuli_coords): diff --git a/brainscore_language/model_helpers/localize.py b/brainscore_language/model_helpers/localize.py new file mode 100644 index 00000000..d27bed8f --- /dev/null +++ b/brainscore_language/model_helpers/localize.py @@ -0,0 +1,174 @@ +from typing import List +from collections import OrderedDict + +import os +import scipy +import torch +import logging +import numpy as 
np +import transformers +import pandas as pd + +from glob import glob +from tqdm import tqdm +from torch.utils.data import Dataset, DataLoader +from pathlib import Path + +from brainscore_language import load_dataset + +BRAINIO_CACHE = os.environ.get("BRAINIO", f"{Path.home()}/.brainio") +os.environ["TOKENIZERS_PARALLELISM"] = "False" + +logger = logging.getLogger(__name__) + +# Code adapted from: https://github.com/bkhmsi/brain-language-suma + +class Fed10_langlocDataset(Dataset): + def __init__(self): + self.num_samples = 240 + + data = load_dataset("Fedorenko2010.localization") + self.sentences = data[data["stim14"]=="S"]["sent"] + self.non_words = data[data["stim14"]=="N"]["sent"] + + def __getitem__(self, idx): + return self.sentences.iloc[idx].strip(), self.non_words.iloc[idx].strip() + + def __len__(self): + return len(self.sentences) + +def _get_layer(module, layer_name: str) -> torch.nn.Module: + SUBMODULE_SEPARATOR = '.' + for part in layer_name.split(SUBMODULE_SEPARATOR): + module = module._modules.get(part) + assert module is not None, f"No submodule found for layer {layer_name}, at part {part}" + return module + +def _register_hook(layer: torch.nn.Module, + key: str, + target_dict: dict): + # instantiate parameters to function defaults; otherwise they would change on next function call + def hook_function(_layer: torch.nn.Module, _input, output: torch.Tensor, key=key): + # fix for when taking out only the hidden state, this is different from dropout because of residual state + # see: https://github.com/huggingface/transformers/blob/c06d55564740ebdaaf866ffbbbabf8843b34df4b/src/transformers/models/gpt2/modeling_gpt2.py#L428 + output = output[0] if isinstance(output, (tuple, list)) else output + target_dict[key] = output + + hook = layer.register_forward_hook(hook_function) + return hook + +def setup_hooks(model, layer_names): + """ set up the hooks for recording internal neural activity from the model (aka layer activations) """ + hooks = [] + 
layer_representations = OrderedDict() + + for layer_name in layer_names: + layer = _get_layer(model, layer_name) + hook = _register_hook(layer, key=layer_name, + target_dict=layer_representations) + hooks.append(hook) + + return hooks, layer_representations + +def extract_batch( + model: torch.nn.Module, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + layer_names: List[str], +): + + batch_activations = {layer_name: [] for layer_name in layer_names} + hooks, layer_representations = setup_hooks(model, layer_names) + + with torch.no_grad(): + _ = model(input_ids=input_ids, attention_mask=attention_mask) + + for sample_idx in range(len(input_ids)): + for layer_idx, layer_name in enumerate(layer_names): + activations = layer_representations[layer_name][sample_idx][-1].cpu() + batch_activations[layer_name] += [activations] + + for hook in hooks: + hook.remove() + + return batch_activations + +def extract_representations( + model: torch.nn.Module, + tokenizer: transformers.PreTrainedTokenizer, + layer_names: List[str], + hidden_dim: int, + batch_size: int, + device: torch.device, +): + langloc_dataset = Fed10_langlocDataset() + + # Get the activations of the model on the dataset + langloc_dataloader = DataLoader(langloc_dataset, batch_size=batch_size, num_workers=0) + + logger.debug(f"> Using Device: {device}") + + model.eval() + model.to(device) + + final_layer_representations = { + "sentences": {layer_name: np.zeros((langloc_dataset.num_samples, hidden_dim)) for layer_name in layer_names}, + "non-words": {layer_name: np.zeros((langloc_dataset.num_samples, hidden_dim)) for layer_name in layer_names} + } + + for batch_idx, batch_data in tqdm(enumerate(langloc_dataloader)): + + sents, non_words = batch_data + sent_tokens = tokenizer(sents, truncation=True, max_length=12, return_tensors='pt').to(device) + non_words_tokens = tokenizer(non_words, truncation=True, max_length=12, return_tensors='pt').to(device) + assert sent_tokens.input_ids.size(1) == 
non_words_tokens.input_ids.size(1) + + batch_real_actv = extract_batch(model, sent_tokens["input_ids"], sent_tokens["attention_mask"], layer_names) + batch_rand_actv = extract_batch(model, non_words_tokens["input_ids"], non_words_tokens["attention_mask"], layer_names) + + for layer_name in layer_names: + final_layer_representations["sentences"][layer_name][batch_idx*batch_size:(batch_idx+1)*batch_size] = torch.stack(batch_real_actv[layer_name]).numpy() + final_layer_representations["non-words"][layer_name][batch_idx*batch_size:(batch_idx+1)*batch_size] = torch.stack(batch_rand_actv[layer_name]).numpy() + + return final_layer_representations + +def localize_fed10(model_id: str, + model: torch.nn.Module, + top_k: int, + tokenizer: transformers.PreTrainedTokenizer, + hidden_dim: int, + layer_names: List[str], + batch_size: int, + device: torch.device, +): + """ + Localize the model by selecting the top `top_k` units. + """ + + save_path = f"{BRAINIO_CACHE}/{model_id}_language_mask.npy" + + if os.path.exists(save_path): + logger.debug(f"Loading language mask from {save_path}") + return np.load(save_path) + + representations = extract_representations(model, tokenizer, layer_names, hidden_dim, batch_size, device) + + p_values_matrix = np.zeros((len(layer_names), hidden_dim)) + t_values_matrix = np.zeros((len(layer_names), hidden_dim)) + + for layer_idx, layer_name in tqdm(enumerate(layer_names)): + + sentences_actv = representations["sentences"][layer_name] + non_words_actv = representations["non-words"][layer_name] + + t_values_matrix[layer_idx], p_values_matrix[layer_idx] = scipy.stats.ttest_ind(sentences_actv, non_words_actv, axis=0, equal_var=False) + + def is_topk(a, k=1): + _, rix = np.unique(-a, return_inverse=True) + return np.where(rix < k, 1, 0).reshape(a.shape) + + language_mask = is_topk(t_values_matrix, k=top_k) + + np.save(save_path, language_mask) + logger.debug(f"{model_id} language mask cached to {save_path}") + return language_mask diff --git 
a/examples/score_localization.py b/examples/score_localization.py new file mode 100644 index 00000000..eda4d02d --- /dev/null +++ b/examples/score_localization.py @@ -0,0 +1,26 @@ +from tqdm import tqdm +from brainscore_language import load_benchmark +from brainscore_language.model_helpers.huggingface import HuggingfaceSubject +from brainscore_language import ArtificialSubject + +benchmark = load_benchmark('Pereira2018.243sentences-linear') + +num_blocks = 12 +layer_names = [f'transformer.h.{block}.{layer_type}' + for block in range(num_blocks) + for layer_type in ['ln_1', 'attn', 'ln_2', 'mlp'] +] + +layer_model = HuggingfaceSubject(model_id='gpt2', + region_layer_mapping={ArtificialSubject.RecordingTarget.language_system: layer_names}, + use_localizer=True, + localizer_kwargs={ + 'hidden_dim': 768, + 'batch_size': 16, + "top_k": 4096, + } +) + +layer_score = benchmark(layer_model) + +print(layer_score) \ No newline at end of file From f98c6abef663fb03ddf208b9cfd524a172f10af4 Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Sat, 6 Jul 2024 07:43:04 +0200 Subject: [PATCH 2/8] changed variable names in localization example --- examples/score_localization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/score_localization.py b/examples/score_localization.py index eda4d02d..b7dd1d22 100644 --- a/examples/score_localization.py +++ b/examples/score_localization.py @@ -11,7 +11,7 @@ for layer_type in ['ln_1', 'attn', 'ln_2', 'mlp'] ] -layer_model = HuggingfaceSubject(model_id='gpt2', +model = HuggingfaceSubject(model_id='gpt2', region_layer_mapping={ArtificialSubject.RecordingTarget.language_system: layer_names}, use_localizer=True, localizer_kwargs={ @@ -21,6 +21,6 @@ } ) -layer_score = benchmark(layer_model) +model_score = benchmark(model) -print(layer_score) \ No newline at end of file +print(model_score) \ No newline at end of file From 5961de06d89acc7084f680cc7dc396861b1ee204 Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: 
Fri, 19 Jul 2024 04:58:36 -0400 Subject: [PATCH 3/8] Update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 8a001910..8be6bb82 100644 --- a/.gitignore +++ b/.gitignore @@ -136,6 +136,7 @@ dmypy.json ### project specific additions: +brainscore_language/data html .vscode *.code-workspace @@ -148,4 +149,4 @@ cache .cache .idea/ wandb/ -**/models/lm1b/resources \ No newline at end of file +**/models/lm1b/resources From da7672e5581a732c127c82df6fb457f51ad1898e Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Thu, 8 Aug 2024 09:21:09 -0400 Subject: [PATCH 4/8] added comments --- brainscore_language/data/fedorenko2010_localization/__init__.py | 1 + brainscore_language/model_helpers/localize.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/brainscore_language/data/fedorenko2010_localization/__init__.py b/brainscore_language/data/fedorenko2010_localization/__init__.py index 22a51d27..ff7bf276 100644 --- a/brainscore_language/data/fedorenko2010_localization/__init__.py +++ b/brainscore_language/data/fedorenko2010_localization/__init__.py @@ -27,6 +27,7 @@ def load_data(): data["sent"] = data["stim2"].apply(str.lower) for stimuli_idx in range(3, 14): + # lowercase each stimulus/word then add it to the sentence data["sent"] += " " + data[f"stim{stimuli_idx}"].apply(str.lower) return data diff --git a/brainscore_language/model_helpers/localize.py b/brainscore_language/model_helpers/localize.py index d27bed8f..83177349 100644 --- a/brainscore_language/model_helpers/localize.py +++ b/brainscore_language/model_helpers/localize.py @@ -16,8 +16,8 @@ from brainscore_language import load_dataset +# To cache the language mask BRAINIO_CACHE = os.environ.get("BRAINIO", f"{Path.home()}/.brainio") -os.environ["TOKENIZERS_PARALLELISM"] = "False" logger = logging.getLogger(__name__) From d35d254b72d87be1daa36dc7bcdbcc0899917a90 Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Mon, 12 Aug 
2024 17:24:53 +0700 Subject: [PATCH 5/8] removed num_samples from Fed10_langlocDataset --- brainscore_language/model_helpers/localize.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/brainscore_language/model_helpers/localize.py b/brainscore_language/model_helpers/localize.py index 83177349..47134461 100644 --- a/brainscore_language/model_helpers/localize.py +++ b/brainscore_language/model_helpers/localize.py @@ -25,8 +25,6 @@ class Fed10_langlocDataset(Dataset): def __init__(self): - self.num_samples = 240 - data = load_dataset("Fedorenko2010.localization") self.sentences = data[data["stim14"]=="S"]["sent"] self.non_words = data[data["stim14"]=="N"]["sent"] @@ -112,8 +110,8 @@ def extract_representations( model.to(device) final_layer_representations = { - "sentences": {layer_name: np.zeros((langloc_dataset.num_samples, hidden_dim)) for layer_name in layer_names}, - "non-words": {layer_name: np.zeros((langloc_dataset.num_samples, hidden_dim)) for layer_name in layer_names} + "sentences": {layer_name: np.zeros((len(langloc_dataset.sentences), hidden_dim)) for layer_name in layer_names}, + "non-words": {layer_name: np.zeros((len(langloc_dataset.sentences), hidden_dim)) for layer_name in layer_names} } for batch_idx, batch_data in tqdm(enumerate(langloc_dataloader)): From a07c9d233dbd938fb91003b80d21cdda2dbcf793 Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Sun, 18 Aug 2024 19:39:30 +0200 Subject: [PATCH 6/8] SUMA now supported --- .gitignore | 1 + brainscore_language/model_helpers/localize.py | 5 +- .../model_helpers/modeling_suma.py | 1176 +++++++++++++++++ brainscore_language/models/suma/__init__.py | 30 + examples/score_suma.py | 13 + 5 files changed, 1222 insertions(+), 3 deletions(-) create mode 100644 brainscore_language/model_helpers/modeling_suma.py create mode 100644 brainscore_language/models/suma/__init__.py create mode 100644 examples/score_suma.py diff --git a/.gitignore b/.gitignore index 8be6bb82..69b1c768 100644 --- 
a/.gitignore +++ b/.gitignore @@ -150,3 +150,4 @@ cache .idea/ wandb/ **/models/lm1b/resources +conda_score--* diff --git a/brainscore_language/model_helpers/localize.py b/brainscore_language/model_helpers/localize.py index 47134461..a56eea55 100644 --- a/brainscore_language/model_helpers/localize.py +++ b/brainscore_language/model_helpers/localize.py @@ -7,7 +7,6 @@ import logging import numpy as np import transformers -import pandas as pd from glob import glob from tqdm import tqdm @@ -114,7 +113,7 @@ def extract_representations( "non-words": {layer_name: np.zeros((len(langloc_dataset.sentences), hidden_dim)) for layer_name in layer_names} } - for batch_idx, batch_data in tqdm(enumerate(langloc_dataloader)): + for batch_idx, batch_data in tqdm(enumerate(langloc_dataloader), total=len(langloc_dataloader)): sents, non_words = batch_data sent_tokens = tokenizer(sents, truncation=True, max_length=12, return_tensors='pt').to(device) @@ -154,7 +153,7 @@ def localize_fed10(model_id: str, p_values_matrix = np.zeros((len(layer_names), hidden_dim)) t_values_matrix = np.zeros((len(layer_names), hidden_dim)) - for layer_idx, layer_name in tqdm(enumerate(layer_names)): + for layer_idx, layer_name in tqdm(enumerate(layer_names), total=len(layer_names)): sentences_actv = representations["sentences"][layer_name] non_words_actv = representations["non-words"][layer_name] diff --git a/brainscore_language/model_helpers/modeling_suma.py b/brainscore_language/model_helpers/modeling_suma.py new file mode 100644 index 00000000..b4d702a4 --- /dev/null +++ b/brainscore_language/model_helpers/modeling_suma.py @@ -0,0 +1,1176 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch SUMA model adapted from LLaMA.""" +import math +import warnings +import numpy as np +from typing import List, Optional, Tuple, Union, Any, Dict + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +from pandas import read_pickle + +from transformers.activations import ACT2FN +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, +) + +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import ModelOutput, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) + +from dataclasses import dataclass + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). 
+ + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + last_hidden_state: torch.FloatTensor = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + langnet_states: Optional[Tuple[torch.FloatTensor]] = None + internal_states: Optional[Tuple[torch.FloatTensor]] = None + +def custom_init_weights(module, method, variance: float=1): + if isinstance(module, nn.Linear) or isinstance(module, nn.Embedding): + if method == "uniform": + nn.init.uniform_(module.weight) + elif method == "normal": + nn.init.normal_(module.weight, std=variance) + elif method == "xavier_uniform": + nn.init.xavier_uniform_(module.weight, gain=variance) + elif method == "xavier_normal": + nn.init.xavier_normal_(module.weight, gain=variance) + elif method == "kaiming_uniform": + nn.init.kaiming_uniform_(module.weight) + elif method == "kaiming_normal": + nn.init.kaiming_normal_(module.weight) + elif method == "orthogonal": + nn.init.orthogonal_(module.weight, gain=variance) + + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.fill_(0) # this was 0.01 + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "SUMAConfig" + +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" SUMA model configuration""" + +logger = logging.get_logger(__name__) + +class SUMAConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the LLaMA-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). 
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ + ```python + >>> from transformers import LlamaModel, SUMAConfig + + >>> # Initializing a LLaMA llama-7b style configuration + >>> configuration = SUMAConfig() + + >>> # Initializing a model from the llama-7b style configuration + >>> model = LlamaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "llama" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + num_cycles=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.num_cycles = num_cycles + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + +class Cache: + """ + Base, abstract class for all caches. The actual data structure is specific to each subclass. 
+ """ + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. These are specific to each subclass and allow new types of + cache to be created. + + Return: + A tuple containing the updated key and value states. + """ + raise NotImplementedError("Make sure to implement `update` in a subclass.") + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.") + + def get_max_length(self) -> Optional[int]: + """Returns the maximum sequence length of the cached states, if there is any.""" + raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.") + + def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int: + """Given the sequence length of the new inputs, returns the usable length of the cache.""" + # Cache without size limit -> all cache is usable + # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache + # length, we will need to evict part of the cache (and thus not all cache is usable) + max_length = self.get_max_length() + previous_seq_length = self.get_seq_length(layer_idx) + if max_length is not None and previous_seq_length + new_seq_length > max_length: + return max_length - new_seq_length + return previous_seq_length + +class 
DynamicCache(Cache): + """ + A cache that grows dynamically as more tokens are generated. This is the default for generative models. + + It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is + `[batch_size, num_heads, seq_len, head_dim]`. + """ + + def __init__(self) -> None: + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + self.seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen + + def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: + """ + Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return (self.key_cache[layer_idx], self.value_cache[layer_idx]) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + def __iter__(self): + """ + Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield (self.key_cache[layer_idx], self.value_cache[layer_idx]) + + def __len__(self): + """ + Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds + to the number of layers in the model. + """ + return len(self.key_cache) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. 
+ cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. + + Return: + A tuple containing the updated key and value states. + """ + # Update the number of seen tokens + if layer_idx == 0: + self.seen_tokens += key_states.shape[-2] + + # Update the cache + if len(self.key_cache) <= layer_idx: + self.key_cache.append(key_states) + self.value_cache.append(value_states) + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + if len(self.key_cache) <= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] + + def get_max_length(self) -> Optional[int]: + """Returns the maximum sequence length of the cached states. 
DynamicCache does not have a maximum length.""" + return None + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: + """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format.""" + legacy_cache = () + for layer_idx in range(len(self)): + legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),) + return legacy_cache + + @classmethod + def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": + """Converts a cache in the legacy cache format into an equivalent `DynamicCache`.""" + cache = cls() + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + +class LlamaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + LlamaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + +ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the 
equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + +class LlamaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: SUMAConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + self.use_pos_emb = False + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + ######### Badr Edits to account for custom AttentionMaskConverter ########## + # if self.add_attn_mask: + attn_weights = attn_weights + attention_mask + # else: + # attn_weights = attn_weights * attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + + # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not 
output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + +LLAMA_ATTENTION_CLASSES = { + "eager": LlamaAttention, +} + +class SUMADecoderLayer(nn.Module): + def __init__(self, config: SUMAConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + internal_states = [] + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + internal_states += [hidden_states] + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + + internal_states += [hidden_states] + + hidden_states = residual + hidden_states + + outputs = (hidden_states, internal_states) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +LLAMA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`SUMAConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class LlamaPreTrainedModel(PreTrainedModel): + config_class = SUMAConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["SUMADecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +LLAMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. 
See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. 
+ use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare LLaMA Model outputting raw hidden-states without any specific head on top.", + LLAMA_START_DOCSTRING, +) +class SUMAModel(LlamaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`SUMADecoderLayer`] + + Args: + config: SUMAConfig + """ + + def __init__(self, config: SUMAConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.num_cycles = config.num_cycles + self.language_mask = None + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [SUMADecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self._use_sdpa = config._attn_implementation == "sdpa" + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def set_language_mask(self, language_mask): + self.language_mask = torch.tensor(language_mask.flatten()) + + 
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + num_cycles: Optional[int] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + if num_cycles is None: + if type(self.num_cycles) == str and "dynamic" in self.num_cycles: + if self.num_cycles == "dynamic": + num_cycles = int(np.ceil(hidden_states.size(1) / 8)) + else: + factor = int(self.num_cycles.split("-")[1]) + num_cycles = int(np.ceil(hidden_states.size(1) / factor)) + + elif self.num_cycles == "default": + num_cycles = 1 + else: + num_cycles = int(self.num_cycles) + + internal_states = [] + for _ in range(num_cycles): + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + 
past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + internal_states.extend(layer_outputs[1]) + + if use_cache: + next_decoder_cache = layer_outputs[3 if output_attentions else 2] + + if output_attentions: + all_self_attns += (layer_outputs[2],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.language_mask is not None: + internal_states = torch.cat(internal_states, dim=-1) + langnet_states = internal_states[:,:,self.language_mask.bool()] + else: + langnet_states = hidden_states + + internal_states = torch.stack(internal_states) + + next_cache = None + + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, langnet_states] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + langnet_states=langnet_states, + internal_states=internal_states, + ) + +class SUMAForCausalLM(LlamaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.use_cache = config.use_cache + self.model = SUMAModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + 
self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, LlamaForCausalLM + + >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + use_cache = self.use_cache + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.langnet_states + + # hidden_states = outputs.langnet_states + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0) + logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)] + logits = torch.cat(logits, dim=-1) + else: + learned_hidden_states = self.lm_base(inputs_embeds=hidden_states, attention_mask=attention_mask)[0] + logits = self.lm_head(learned_hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) #+ outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + 
past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past \ No newline at end of file diff --git a/brainscore_language/models/suma/__init__.py b/brainscore_language/models/suma/__init__.py new file mode 100644 index 00000000..4063bbb9 --- /dev/null +++ b/brainscore_language/models/suma/__init__.py @@ -0,0 +1,30 @@ +from brainscore_language import model_registry +from brainscore_language import ArtificialSubject +from brainscore_language.model_helpers.huggingface import HuggingfaceSubject +from brainscore_language.model_helpers.modeling_suma import SUMAModel, SUMAConfig +from transformers import AutoTokenizer + +layer_names = [f'layers.{layer_num}.{layer_desc}' + for layer_num in range(1) + for layer_desc in ["input_layernorm", "self_attn"] +] + +model_registry['suma'] = 
lambda: HuggingfaceSubject( + model_id='suma', + model=SUMAModel( + config=SUMAConfig( + num_hidden_layers=1, + num_attention_heads=512, + num_key_value_heads=512, + num_cycles=2, + ) + ), + tokenizer=AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf'), + region_layer_mapping={ArtificialSubject.RecordingTarget.language_system: layer_names}, + use_localizer=True, + localizer_kwargs={ + 'hidden_dim': 4096, + 'batch_size': 16, + "top_k": 4096, + } +) \ No newline at end of file diff --git a/examples/score_suma.py b/examples/score_suma.py new file mode 100644 index 00000000..edbdce54 --- /dev/null +++ b/examples/score_suma.py @@ -0,0 +1,13 @@ +from brainscore_language import score + +model_score = score(model_identifier='suma', benchmark_identifier='Pereira2018.243sentences-linear') +print(model_score) + +''' +array(0.98581247) +Attributes: + raw: \narray(0.34876988) + ceiling: \narray(0.35378928) + model_identifier: suma + benchmark_identifier: Pereira2018.243sentences-linear +''' \ No newline at end of file From aa4fac8a7664e3738eabb7e03bc38f9ac442df2b Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Sun, 18 Aug 2024 19:43:49 +0200 Subject: [PATCH 7/8] added support for ridge regression --- .../metrics/linear_predictivity/__init__.py | 3 ++- .../metrics/linear_predictivity/metric.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/brainscore_language/metrics/linear_predictivity/__init__.py b/brainscore_language/metrics/linear_predictivity/__init__.py index 30826d9f..b7a9d9e6 100644 --- a/brainscore_language/metrics/linear_predictivity/__init__.py +++ b/brainscore_language/metrics/linear_predictivity/__init__.py @@ -1,4 +1,5 @@ from brainscore_language import metric_registry -from .metric import linear_pearsonr +from .metric import linear_pearsonr, ridge_pearsonr metric_registry['linear_pearsonr'] = linear_pearsonr +metric_registry['ridge_pearsonr'] = ridge_pearsonr \ No newline at end of file diff --git 
a/brainscore_language/metrics/linear_predictivity/metric.py b/brainscore_language/metrics/linear_predictivity/metric.py index be574e46..c0dcd251 100644 --- a/brainscore_language/metrics/linear_predictivity/metric.py +++ b/brainscore_language/metrics/linear_predictivity/metric.py @@ -1,6 +1,6 @@ import numpy as np import scipy.stats -from sklearn.linear_model import LinearRegression +from sklearn.linear_model import LinearRegression, RidgeCV from sklearn.preprocessing import scale from brainio.assemblies import NeuroidAssembly, array_is_element, DataAssembly @@ -157,6 +157,11 @@ def __call__(self, source: DataAssembly, target: DataAssembly) -> Score: coord: (dims, value) for coord, dims, value in walk_coords(target)}, dims=target.dims) return self.cross_regressed_correlation(source, target) +def ridge_regression(xarray_kwargs=None): + regression = RidgeCV(alphas=[10 ** x for x in range(-10, 10)]) + xarray_kwargs = xarray_kwargs or {} + regression = XarrayRegression(regression, **xarray_kwargs) + return regression def linear_regression(xarray_kwargs=None): regression = LinearRegression() @@ -164,13 +169,16 @@ def linear_regression(xarray_kwargs=None): regression = XarrayRegression(regression, **xarray_kwargs) return regression - def pearsonr_correlation(xarray_kwargs=None): xarray_kwargs = xarray_kwargs or {} return XarrayCorrelation(scipy.stats.pearsonr, **xarray_kwargs) - def linear_pearsonr(*args, regression_kwargs=None, correlation_kwargs=None, **kwargs): regression = linear_regression(regression_kwargs or {}) correlation = pearsonr_correlation(correlation_kwargs or {}) return CrossRegressedCorrelation(*args, regression=regression, correlation=correlation, **kwargs) + +def ridge_pearsonr(*args, regression_kwargs=None, correlation_kwargs=None, **kwargs): + regression = ridge_regression(regression_kwargs or {}) + correlation = pearsonr_correlation(correlation_kwargs or {}) + return CrossRegressedCorrelation(*args, regression=regression, correlation=correlation, 
**kwargs) From 856f53073a583edf7fa2e2347bcfe41e5919ecde Mon Sep 17 00:00:00 2001 From: Badr AlKhamissi Date: Sun, 18 Aug 2024 19:50:01 +0200 Subject: [PATCH 8/8] added rdm and cka metrics --- brainscore_language/metrics/cka/__init__.py | 4 + brainscore_language/metrics/cka/metric.py | 111 ++++++++++++++++++++ brainscore_language/metrics/rdm/__init__.py | 13 +++ brainscore_language/metrics/rdm/metric.py | 106 +++++++++++++++++++ 4 files changed, 234 insertions(+) create mode 100644 brainscore_language/metrics/cka/__init__.py create mode 100644 brainscore_language/metrics/cka/metric.py create mode 100644 brainscore_language/metrics/rdm/__init__.py create mode 100644 brainscore_language/metrics/rdm/metric.py diff --git a/brainscore_language/metrics/cka/__init__.py b/brainscore_language/metrics/cka/__init__.py new file mode 100644 index 00000000..ad2dd7ec --- /dev/null +++ b/brainscore_language/metrics/cka/__init__.py @@ -0,0 +1,4 @@ +from brainscore_language import metric_registry +from .metric import CKACrossValidated + +metric_registry['cka'] = CKACrossValidated \ No newline at end of file diff --git a/brainscore_language/metrics/cka/metric.py b/brainscore_language/metrics/cka/metric.py new file mode 100644 index 00000000..49f97cf0 --- /dev/null +++ b/brainscore_language/metrics/cka/metric.py @@ -0,0 +1,111 @@ +import math +import numpy as np + +from brainscore_core.metrics import Score +from brainscore_language.utils.transformations import TestOnlyCrossValidation + + +class Defaults: + expected_dims = ('presentation', 'neuroid') + stimulus_coord = 'stimulus_id' + neuroid_dim = 'neuroid' + neuroid_coord = 'neuroid_id' + +def centering(K): + n = K.shape[0] + unit = np.ones([n, n]) + I = np.eye(n) + H = I - unit / n + + return np.dot(np.dot(H, K), H) + # HKH are the same with KH, KH is the first centering, H(KH) do the second time, + # results are the sme with one time centering + # return np.dot(H, K) # KH + + +def rbf(X, sigma=None): + GX = np.dot(X, X.T) + KX = 
np.diag(GX) - GX + (np.diag(GX) - GX).T + if sigma is None: + mdist = np.median(KX[KX != 0]) + sigma = math.sqrt(mdist) + KX *= - 0.5 / (sigma * sigma) + KX = np.exp(KX) + return KX + + +def kernel_HSIC(X, Y, sigma): + return np.sum(centering(rbf(X, sigma)) * centering(rbf(Y, sigma))) + + +def linear_HSIC(X, Y): + L_X = np.dot(X, X.T) + L_Y = np.dot(Y, Y.T) + return np.sum(centering(L_X) * centering(L_Y)) + + +def linear_CKA(X, Y): + hsic = linear_HSIC(X, Y) + var1 = np.sqrt(linear_HSIC(X, X)) + var2 = np.sqrt(linear_HSIC(Y, Y)) + + return hsic / (var1 * var2) + + +def kernel_CKA(X, Y, sigma=None): + hsic = kernel_HSIC(X, Y, sigma) + var1 = np.sqrt(kernel_HSIC(X, X, sigma)) + var2 = np.sqrt(kernel_HSIC(Y, Y, sigma)) + + return hsic / (var1 * var2) + +class CKAMetric: + """ + Computes a similarity index for the similarity between two assemblies with centered kernel alignment (CKA). + + Kornblith et al., 2019 http://proceedings.mlr.press/v97/kornblith19a/kornblith19a.pdf + """ + + def __init__(self, comparison_coord=Defaults.stimulus_coord): + self._comparison_coord = comparison_coord + + def __call__(self, assembly1, assembly2): + """ + :param brainscore.assemblies.NeuroidAssembly assembly1: + :param brainscore.assemblies.NeuroidAssembly assembly2: + :return: brainscore.assemblies.DataAssembly + """ + # ensure value order + assembly1 = assembly1.sortby(self._comparison_coord) + assembly2 = assembly2.sortby(self._comparison_coord) + assert (assembly1[self._comparison_coord].values == assembly2[self._comparison_coord].values).all() + # ensure dimensions order + dims = assembly1[self._comparison_coord].dims + np.testing.assert_array_equal(assembly2[self._comparison_coord].dims, dims) + assembly1 = assembly1.transpose(*(list(dims) + [dim for dim in assembly1.dims if dim not in dims])) + assembly2 = assembly2.transpose(*(list(dims) + [dim for dim in assembly2.dims if dim not in dims])) + similarity = linear_CKA(assembly1, assembly2) + return Score(similarity) + +class 
CKACrossValidated: + """ + Computes a cross-validated similarity index for the similarity between two assemblies + with centered kernel alignment (CKA). + + Kornblith et al., 2019 http://proceedings.mlr.press/v97/kornblith19a/kornblith19a.pdf + """ + + def __init__(self, comparison_coord=Defaults.stimulus_coord, crossvalidation_kwargs=None): + self._metric = CKAMetric(comparison_coord=comparison_coord) + crossvalidation_defaults = dict(test_size=.9) # leave 10% out + crossvalidation_kwargs = {**crossvalidation_defaults, **(crossvalidation_kwargs or {})} + self._cross_validation = TestOnlyCrossValidation(**crossvalidation_kwargs) + + def __call__(self, assembly1, assembly2): + """ + :param brainio.assemblies.NeuroidAssembly assembly1: + :param brainio.assemblies.NeuroidAssembly assembly2: + :return: brainscore.metrics.Score + """ + + return self._cross_validation(assembly1, assembly2, apply=self._metric) \ No newline at end of file diff --git a/brainscore_language/metrics/rdm/__init__.py b/brainscore_language/metrics/rdm/__init__.py new file mode 100644 index 00000000..f53f5f8d --- /dev/null +++ b/brainscore_language/metrics/rdm/__init__.py @@ -0,0 +1,13 @@ +from brainscore_language import metric_registry +from .metric import RDMCrossValidated + +metric_registry['rdm'] = RDMCrossValidated + +BIBTEX = """@article{kriegeskorte2008representational, + title={Representational similarity analysis-connecting the branches of systems neuroscience}, + author={Kriegeskorte, Nikolaus and Mur, Marieke and Bandettini, Peter A}, + journal={Frontiers in systems neuroscience}, + pages={4}, + year={2008}, + publisher={Frontiers} +}""" \ No newline at end of file diff --git a/brainscore_language/metrics/rdm/metric.py b/brainscore_language/metrics/rdm/metric.py new file mode 100644 index 00000000..e5a46be4 --- /dev/null +++ b/brainscore_language/metrics/rdm/metric.py @@ -0,0 +1,106 @@ +import numpy as np +from scipy.stats import spearmanr + +from brainio.assemblies import DataAssembly, 
walk_coords, NeuroidAssembly +from brainscore_core.metrics import Metric, Score +from brainscore_language.utils.transformations import TestOnlyCrossValidation + +class XarrayDefaults: + expected_dims = ('presentation', 'neuroid') + stimulus_coord = 'stimulus_id' + neuroid_dim = 'neuroid' + neuroid_coord = 'neuroid_id' + +class RDMCrossValidated(Metric): + """ + Computes a coefficient for the similarity between two `RDM`s, using the upper triangular regions + + Kriegeskorte et al., 2008 https://doi.org/10.3389/neuro.06.004.2008 + """ + + def __init__(self, neuroid_dim=XarrayDefaults.neuroid_dim, comparison_coord=XarrayDefaults.stimulus_coord, + crossvalidation_kwargs=None): + self._metric = RDMMetric(neuroid_dim=neuroid_dim, comparison_coord=comparison_coord) + crossvalidation_defaults = dict(test_size=.9) # leave 10% out + # crossvalidation_defaults = dict(train_size=.9, test_size=None) + crossvalidation_kwargs = {**crossvalidation_defaults, **(crossvalidation_kwargs or {})} + self._cross_validation = TestOnlyCrossValidation(**crossvalidation_kwargs) + + def __call__(self, assembly1: NeuroidAssembly, assembly2: NeuroidAssembly) -> Score: + return self._cross_validation(assembly1, assembly2, apply=self._metric) + + +class RDMMetric(Metric): + """ + Computes a coefficient for the similarity between two `RDM`s, using the upper triangular regions + + Kriegeskorte et al., 2008 https://doi.org/10.3389/neuro.06.004.2008 + """ + + def __init__(self, neuroid_dim=XarrayDefaults.neuroid_dim, comparison_coord=XarrayDefaults.stimulus_coord): + self._neuroid_dim = neuroid_dim + self._rdm = RDM(neuroid_dim=neuroid_dim) + self._similarity = RDMSimilarity(comparison_coord=comparison_coord) + + def __call__(self, assembly1: NeuroidAssembly, assembly2: NeuroidAssembly) -> Score: + rdm1 = self._rdm(assembly1) + rdm2 = self._rdm(assembly2) + similarity = self._similarity(rdm1, rdm2) + return Score(similarity) + + +class RDM: + """ + Representational Dissimilarity Matrix. 
Converts an assembly of `presentation x neuroid` into a `presentation x presentation` RDM.
"multi-dimensional case not implemented" + indices = np.argsort(assembly[self._comparison_coord].values) + assembly = type(assembly)(assembly.values[np.ix_(indices, indices)], + coords={coord: (dims, values[indices] if dims == comparison_dims else values) + for coord, dims, values in walk_coords(assembly)}, + dims=assembly.dims) + return assembly \ No newline at end of file