6
6
from sklearn .feature_selection import chi2
7
7
from sklearn .feature_extraction .text import CountVectorizer
8
8
from tmu .models .classification .vanilla_classifier import TMClassifier
9
+ from scipy .sparse import csr_matrix
9
10
10
11
from tmu .tools import BenchmarkTimer
11
12
@@ -63,26 +64,33 @@ def main(args):
63
64
tokenizer = lambda s : s ,
64
65
token_pattern = None ,
65
66
ngram_range = (1 , args .max_ngram ),
67
+ max_features = 100000 ,
66
68
lowercase = False ,
67
69
binary = True
68
70
)
69
71
70
- X_train = vectorizer_X .fit_transform (training_documents )
72
+ X_train = vectorizer_X .fit_transform (training_documents ). astype ( np . uint32 )
71
73
Y_train = train_y .astype (np .uint32 )
72
74
73
- X_test = vectorizer_X .transform (testing_documents )
75
+ X_test = vectorizer_X .transform (testing_documents ). astype ( np . uint32 )
74
76
Y_test = test_y .astype (np .uint32 )
75
77
_LOGGER .info ("Producing bit representation... Done!" )
76
78
77
79
_LOGGER .info ("Selecting Features...." )
78
80
79
- SKB = SelectKBest (chi2 , k = args .features )
80
- SKB .fit (X_train , Y_train )
81
+ # SKB = SelectKBest(chi2, k=args.features)
82
+ # SKB.fit(X_train, Y_train)
81
83
82
- selected_features = SKB .get_support (indices = True )
83
- X_train = SKB .transform (X_train ).astype (np .uint32 )
84
- X_test = SKB .transform (X_test ).astype (np .uint32 )
84
+ selected_features = np .arange (args .features )
85
+ #selected_features = SKB.get_support(indices=True)
86
+ #X_train = SKB.transform(X_train).astype(np.uint32)
87
+ #X_test = SKB.transform(X_test).astype(np.uint32)
85
88
89
+ documents = [["movie" , "all" ], ["very" , "good" ], ["love" , "the" , "book" ]]
90
+ print (documents )
91
+ concepts = vectorizer_X .transform (documents )
92
+ print (concepts )
93
+
86
94
_LOGGER .info ("Selecting Features.... Done!" )
87
95
88
96
tm = TMClassifier (
@@ -91,7 +99,8 @@ def main(args):
91
99
args .s ,
92
100
platform = args .platform ,
93
101
weighted_clauses = args .weighted_clauses ,
94
- clause_drop_p = args .clause_drop_p
102
+ clause_drop_p = args .clause_drop_p ,
103
+ sets = concepts #csr_matrix([[1,8],[0,1],[15,128]])
95
104
)
96
105
97
106
for e in range (args .epochs ):
0 commit comments