Coverage for picos/expressions/samples.py: 76.45%

242 statements  

« prev     ^ index     » next       coverage.py v6.5.0, created at 2023-03-26 07:46 +0000

1# ------------------------------------------------------------------------------ 

2# Copyright (C) 2020 Maximilian Stahlberg 

3# 

4# This file is part of PICOS. 

5# 

6# PICOS is free software: you can redistribute it and/or modify it under the 

7# terms of the GNU General Public License as published by the Free Software 

8# Foundation, either version 3 of the License, or (at your option) any later 

9# version. 

10# 

11# PICOS is distributed in the hope that it will be useful, but WITHOUT ANY 

12# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 

13# A PARTICULAR PURPOSE. See the GNU General Public License for more details. 

14# 

15# You should have received a copy of the GNU General Public License along with 

16# this program. If not, see <http://www.gnu.org/licenses/>. 

17# ------------------------------------------------------------------------------ 

18 

19"""Implements :class:`Samples`.""" 

20 

21import random 

22 

23import cvxopt 

24 

25from .. import glyphs 

26from ..apidoc import api_end, api_start 

27from ..caching import cached_property 

28from .data import cvxopt_hcat, load_data, load_shape 

29from .exp_affine import ComplexAffineExpression, Constant 

30from .expression import Expression 

31 

32_API_START = api_start(globals()) 

33# ------------------------------- 

34 

35 

36class Samples(): 

37 """A collection of data points. 

38 

39 :Example: 

40 

41 >>> from picos.expressions import Samples 

42 >>> # Load the column-major vectorization of six matrices. 

43 >>> data = [[[1*i, 3*i], 

44 ... [2*i, 4*i]] for i in range(1, 7)] 

45 >>> S = Samples(data) 

46 >>> S 

47 <Samples: (6 4-dimensional samples)> 

48 >>> [S.num, S.dim, S.original_shape] # Metadata. 

49 [6, 4, (2, 2)] 

50 >>> S.matrix # All samples as the columns of one matrix. 

51 <4×6 Real Constant: [4×6]> 

52 >>> print(S.matrix) 

53 [ 1.00e+00 2.00e+00 3.00e+00 4.00e+00 5.00e+00 6.00e+00] 

54 [ 2.00e+00 4.00e+00 6.00e+00 8.00e+00 1.00e+01 1.20e+01] 

55 [ 3.00e+00 6.00e+00 9.00e+00 1.20e+01 1.50e+01 1.80e+01] 

56 [ 4.00e+00 8.00e+00 1.20e+01 1.60e+01 2.00e+01 2.40e+01] 

57 >>> print(S[0].T) # The first sample (transposed for brevity). 

58 [ 1.00e+00 2.00e+00 3.00e+00 4.00e+00] 

59 >>> print(S.mean.T) # The sample mean (transposed for brevity). 

60 [ 3.50e+00 7.00e+00 1.05e+01 1.40e+01] 

61 >>> print(S.covariance) # The sample covariance matrix. 

62 [ 3.50e+00 7.00e+00 1.05e+01 1.40e+01] 

63 [ 7.00e+00 1.40e+01 2.10e+01 2.80e+01] 

64 [ 1.05e+01 2.10e+01 3.15e+01 4.20e+01] 

65 [ 1.40e+01 2.80e+01 4.20e+01 5.60e+01] 

66 >>> print(S.original[0]) # The first sample in its original shape. 

67 [ 1.00e+00 3.00e+00] 

68 [ 2.00e+00 4.00e+00] 

69 >>> U = S.select([0, 2, 4]) # Select a subset of samples by indices. 

70 >>> print(U.matrix) 

71 [ 1.00e+00 3.00e+00 5.00e+00] 

72 [ 2.00e+00 6.00e+00 1.00e+01] 

73 [ 3.00e+00 9.00e+00 1.50e+01] 

74 [ 4.00e+00 1.20e+01 2.00e+01] 

75 >>> T, V = S.partition() # Split into training and validation samples. 

76 >>> print(T.matrix) 

77 [ 1.00e+00 2.00e+00 3.00e+00] 

78 [ 2.00e+00 4.00e+00 6.00e+00] 

79 [ 3.00e+00 6.00e+00 9.00e+00] 

80 [ 4.00e+00 8.00e+00 1.20e+01] 

81 >>> print(V.matrix) 

82 [ 4.00e+00 5.00e+00 6.00e+00] 

83 [ 8.00e+00 1.00e+01 1.20e+01] 

84 [ 1.20e+01 1.50e+01 1.80e+01] 

85 [ 1.60e+01 2.00e+01 2.40e+01] 

86 """ 

87 

88 def __new__(cls, samples=None, forced_original_shape=None, **kwargs): 

89 """Prepare a :class:`Samples` instance.""" 

90 if isinstance(samples, cls): 

91 if forced_original_shape is not None: 

92 forced_shape = load_shape(forced_original_shape) 

93 if forced_shape[0]*forced_shape[1] != samples.dim: 

94 raise ValueError("Incompatible forced original shape.") 

95 

96 if forced_shape == samples.original_shape: 

97 # Shapes are consistent, return the existing instance. 

98 return samples 

99 else: 

100 # Make a shallow copy and change only the original shape. 

101 self = object.__new__(cls) 

102 self._cached_cvx_mat = samples._cached_cvx_mat 

103 self._cached_cvx_vec = samples._cached_cvx_vec 

104 self._cached_pic_mat = samples._cached_pic_mat 

105 self._cached_pic_vec = samples._cached_pic_vec 

106 self._original_shape = forced_shape 

107 return self 

108 else: 

109 # Return the existing instance. 

110 return samples 

111 else: 

112 # Return a new instance. 

113 self = object.__new__(cls) 

114 self._cached_cvx_mat = None 

115 self._cached_cvx_vec = None 

116 self._cached_pic_mat = None 

117 self._cached_pic_vec = None 

118 self._original_shape = None 

119 return self 

120 

121 def __init__(self, samples, forced_original_shape=None, always_copy=True): 

122 """Load a number of data points (samples). 

123 

124 :param samples: 

125 Any of the following: 

126 

127 - A tuple or list of constants, each of which denotes a sample 

128 vector. Matrices are vectorized but their :attr:`original_shape` 

129 is stored and may be used by PICOS internally. 

130 - A constant row or column vector whose entries denote scalar 

131 samples. 

132 - A constant matrix whose columns denote the samples. 

133 - Another :class:`Samples` instance. If possible, it is returned as 

134 is (:class:`Samples` instances are immutable), otherwise a shallow 

135 copy with the necessary modifications is returned instead. 

136 

137 In any case, constants may be given as constant numeric data values 

138 (anything recognized by :func:`~.data.load_data`) or as constant 

139 PICOS expressions. 

140 

141 :param forced_original_shape: 

142 Overwrites :attr:`original_shape` with the given shape. 

143 

144 :param bool always_copy: 

145 If this is :obj:`False`, then data that is provided in the form of 

146 CVXOPT types is not copied but referenced if possible. This can 

147 speed up instance creation but will introduce inconsistencies if the 

148 original data is modified. Note that this argument has no impact if 

149 the ``samples`` argument already is a :class:`Samples` instance; in 

150 this case data is never copied. 

151 """ 

152 if isinstance(samples, Samples): 

153 # Handled in __new__. 

154 return 

155 elif isinstance(samples, (tuple, list)): 

156 if not samples: 

157 raise ValueError("Need at least one sample.") 

158 

159 if all(isinstance(s, (int, float, complex)) for s in samples): 

160 # Efficiently handle a list of scalars. 

161 self._cached_cvx_mat = load_data(samples)[0].T 

162 elif all(isinstance(s, ComplexAffineExpression) 

163 and s.constant for s in samples): 

164 if len(set(s.shape for s in samples)) != 1: 

165 raise ValueError("Cannot load samples of differing shapes.") 

166 

167 self._original_shape = samples[0].shape 

168 self._cached_pic_vec = tuple(s.vec for s in samples) 

169 else: 

170 samples = tuple( 

171 load_data(s, alwaysCopy=always_copy)[0] for s in samples) 

172 

173 if len(set(s.size for s in samples)) != 1: 

174 raise ValueError("Cannot load samples of differing shapes.") 

175 

176 self._original_shape = samples[0].size 

177 self._cached_cvx_vec = tuple( 

178 s if s.size[1] == 1 else s[:] for s in samples) 

179 elif isinstance(samples, Expression): 

180 samples = samples.refined 

181 

182 if not isinstance(samples, ComplexAffineExpression): 

183 raise TypeError("Can only extract samples from a (constant) " 

184 "affine expression, not from an instance of {}." 

185 .format(type(samples).__name__)) 

186 

187 if not samples.constant: 

188 raise TypeError("Can only extract samples from a constant " 

189 "expression, {} is not constant.".format(samples.string)) 

190 

191 self._cached_pic_mat = samples 

192 

193 # Treat any vector as a number of scalar samples. 

194 if self._cached_pic_mat.shape[1] == 1: 

195 self._cached_pic_mat = self._cached_pic_mat.T 

196 else: 

197 self._cached_cvx_mat = load_data(samples, alwaysCopy=always_copy)[0] 

198 

199 # Treat any vector as a number of scalar samples. 

200 if self._cached_cvx_mat.size[1] == 1: 

201 self._cached_cvx_mat = self._cached_cvx_mat.T 

202 

203 assert any(samples is not None for samples in ( 

204 self._cached_cvx_vec, 

205 self._cached_pic_vec, 

206 self._cached_cvx_mat, 

207 self._cached_pic_mat)) 

208 

209 if forced_original_shape is not None: 

210 forced_shape = load_shape(forced_original_shape) 

211 if forced_shape[0]*forced_shape[1] != self.dim: 

212 raise ValueError("Incompatible forced original shape.") 

213 self._original_shape = forced_shape 

214 

215 def __len__(self): 

216 """Number of samples.""" 

217 return self.num 

218 

219 def __str__(self): 

220 return glyphs.parenth( 

221 "{} {}-dimensional samples".format(self.num, self.dim)) 

222 

223 def __repr__(self): 

224 return glyphs.repr2("Samples", self.__str__()) 

225 

226 def __getitem__(self, i): 

227 return self.vectors[i] 

228 

229 def __iter__(self): 

230 for vector in self.vectors: 

231 yield vector 

232 

233 @property 

234 def _cvxopt_matrix(self): 

235 """A CVXOPT dense or sparse matrix whose columns are the samples. 

236 

237 This cached property is used by PICOS internally as accessing the CVXOPT 

238 value of a constant PICOS expression would create a copy of the data. 

239 

240 .. warning:: 

241 

242 :class:`Sample` instances are supposed to be immutable, so you are 

243 expected not to modify the returned CVXOPT objects. 

244 """ 

245 if self._cached_cvx_mat is not None: 

246 pass 

247 elif self._cached_pic_mat is not None: 

248 self._cached_cvx_mat = self._cached_pic_mat.value_as_matrix 

249 elif self._cached_cvx_vec is not None: 

250 self._cached_cvx_mat = cvxopt_hcat(self._cached_cvx_vec) 

251 else: 

252 self._cached_cvx_mat = cvxopt_hcat( 

253 [s.value_as_matrix for s in self._cached_pic_vec]) 

254 

255 return self._cached_cvx_mat 

256 

257 @property 

258 def _cvxopt_vectors(self): 

259 """A :class:`tuple` containing the samples as CVXOPT column vectors. 

260 

261 This cached property is used by PICOS internally as accessing the CVXOPT 

262 value of a constant PICOS expression would create a copy of the data. 

263 

264 .. warning:: 

265 

266 :class:`Sample` instances are supposed to be immutable, so you are 

267 expected not to modify the returned CVXOPT objects. 

268 """ 

269 if self._cached_cvx_vec is not None: 

270 pass 

271 elif self._cached_cvx_mat is not None: 

272 self._cached_cvx_vec = tuple(self._cached_cvx_mat[:, i] 

273 for i in range(self._cached_cvx_mat.size[1])) 

274 elif self._cached_pic_vec is not None: 

275 self._cached_cvx_vec = tuple( 

276 s.value_as_matrix for s in self._cached_pic_vec) 

277 else: 

278 # We need to convert from a PICOS to a CVXOPT matrix, do so in a way 

279 # that caches the result. 

280 _ = self._cvxopt_matrix 

281 assert self._cached_cvx_mat is not None 

282 

283 self._cached_cvx_vec = tuple(self._cached_cvx_mat[:, i] 

284 for i in range(self._cached_cvx_mat.size[1])) 

285 

286 return self._cached_cvx_vec 

287 

288 @property 

289 def matrix(self): 

290 """A matrix whose columns are the samples.""" 

291 if self._cached_pic_mat is not None: 

292 pass 

293 else: 

294 self._cached_pic_mat = Constant(self._cvxopt_matrix) 

295 

296 return self._cached_pic_mat 

297 

298 @property 

299 def vectors(self): 

300 """A :class:`tuple` containing the samples as column vectors.""" 

301 if self._cached_pic_vec is not None: 

302 pass 

303 else: 

304 self._cached_pic_vec = tuple( 

305 Constant(s) for s in self._cvxopt_vectors) 

306 

307 return self._cached_pic_vec 

308 

309 @cached_property 

310 def original(self): 

311 """A :class:`tuple` containing the samples in their original shape.""" 

312 shape = self.original_shape 

313 

314 if shape[1] == 1: 

315 return self.vectors 

316 else: 

317 return tuple(sample.reshaped(shape) for sample in self) 

318 

319 @property 

320 def dim(self): 

321 """Sample dimension.""" 

322 if self._cached_cvx_mat is not None: 

323 return self._cached_cvx_mat.size[0] 

324 elif self._cached_pic_mat is not None: 

325 return self._cached_pic_mat.shape[0] 

326 elif self._cached_cvx_vec is not None: 

327 # NOTE: len() counts nonzero entries for sparse matrices. 

328 return self._cached_cvx_vec[0].size[0] 

329 else: 

330 return len(self._cached_pic_vec[0]) 

331 

332 @property 

333 def num(self): 

334 """Number of samples.""" 

335 if self._cached_cvx_mat is not None: 

336 return self._cached_cvx_mat.size[1] 

337 elif self._cached_pic_mat is not None: 

338 return self._cached_pic_mat.shape[1] 

339 elif self._cached_cvx_vec is not None: 

340 return len(self._cached_cvx_vec) 

341 else: 

342 return len(self._cached_pic_vec) 

343 

344 @property 

345 def original_shape(self): 

346 """Original shape of the samples before vectorization.""" 

347 if self._original_shape is None: 

348 self._original_shape = (self.dim, 1) 

349 

350 return self._original_shape 

351 

352 @cached_property 

353 def mean(self): 

354 """The sample mean as a column vector.""" 

355 return Constant(sum(self._cvxopt_vectors) / self.num) 

356 

357 @cached_property 

358 def covariance(self): 

359 """The sample covariance matrix.""" 

360 if self.num == 1: 

361 return cvxopt.spmatrix([], [], [], (1, 1)) 

362 

363 mu = self.mean.value_as_matrix 

364 X = self._cvxopt_matrix 

365 Y = mu*cvxopt.matrix(1, (1, self.num)) 

366 Z = X - Y 

367 

368 return Constant(Z * Z.T / (self.num - 1)) 

369 

370 def shuffled(self, rng=None): 

371 """Return a randomly shuffled instance of the samples. 

372 

373 :param rng: 

374 A function that generates a random :class:`float` in :math:`[0, 1)`. 

375 Defaults to whatever :func:`random.shuffle` defaults to. 

376 

377 :Example: 

378 

379 >>> from picos.expressions import Samples 

380 >>> S = Samples(range(6)) 

381 >>> print(S.matrix) 

382 [ 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+00 5.00e+00] 

383 >>> rng = lambda: 0.5 # Fake RNG for reproducibility. 

384 >>> print(S.shuffled(rng).matrix) 

385 [ 0.00e+00 5.00e+00 1.00e+00 4.00e+00 2.00e+00 3.00e+00] 

386 """ 

387 order = list(range(self.num)) 

388 random.shuffle(order, rng) 

389 

390 S = self.__class__.__new__(self.__class__) 

391 

392 if self._cached_cvx_mat is not None: 

393 S._cached_cvx_mat = self._cached_cvx_mat[:, order] 

394 

395 if self._cached_cvx_vec is not None: 

396 S._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in order) 

397 

398 if self._cached_pic_mat is not None: 

399 # NOTE: Rename to a default string for consistency. 

400 S._cached_pic_mat = self._cached_pic_mat[:, order].renamed( 

401 glyphs.matrix(glyphs.shape(self._cached_pic_mat.shape))) 

402 

403 if self._cached_pic_vec is not None: 

404 S._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in order) 

405 

406 return S 

407 

408 def partition(self, after_or_fraction=0.5): 

409 """Split the samples into two parts. 

410 

411 :param after_or_fraction: 

412 Either a fraction strictly between zero and one that denotes the 

413 relative size of the first partition or an integer that denotes the 

414 number of samples to put in the first partition. 

415 :type after_or_fraction: 

416 int or float 

417 """ 

418 if isinstance(after_or_fraction, float): 

419 if after_or_fraction <= 0 or after_or_fraction >= 1: 

420 raise ValueError( 

421 "A partitioning fraction must be strictly between 0 and 1.") 

422 

423 n = round(self.num * after_or_fraction) 

424 n = min(n, self.num - 1) 

425 n = max(1, n) 

426 else: 

427 n = int(after_or_fraction) 

428 

429 if n < 1 or n >= self.num: 

430 raise ValueError("Partitioning would leave one partition empty.") 

431 

432 A = self.__class__.__new__(self.__class__) 

433 B = self.__class__.__new__(self.__class__) 

434 

435 if self._cached_cvx_mat is not None: 

436 A._cached_cvx_mat = self._cached_cvx_mat[:, :n] 

437 B._cached_cvx_mat = self._cached_cvx_mat[:, n:] 

438 

439 if self._cached_cvx_vec is not None: 

440 A._cached_cvx_vec = self._cached_cvx_vec[:n] 

441 B._cached_cvx_vec = self._cached_cvx_vec[n:] 

442 

443 if self._cached_pic_mat is not None: 

444 A._cached_pic_mat = self._cached_pic_mat[:, :n] 

445 B._cached_pic_mat = self._cached_pic_mat[:, n:] 

446 

447 if self._cached_pic_vec is not None: 

448 A._cached_pic_vec = self._cached_pic_vec[:n] 

449 B._cached_pic_vec = self._cached_pic_vec[n:] 

450 

451 A._original_shape = self._original_shape 

452 B._original_shape = self._original_shape 

453 

454 return A, B 

455 

456 def kfold(self, k): 

457 r"""Perform :math:`k`-fold cross-validation (without shuffling). 

458 

459 If random shuffling is desired, write ``S.shuffled().kfold(k)`` where 

460 ``S`` is your :class:`Samples` instance. To make the shuffling 

461 reproducible, see :meth:`shuffled`. 

462 

463 :returns list(tuple): 

464 A list of :math:`k` training set and validation set pairs. 

465 

466 .. warning:: 

467 

468 If the number of samples :math:`n` is not a multiple of :math:`k`, 

469 then the last :math:`n \bmod k` samples will appear in every 

470 training but in no validation set. 

471 

472 :Example: 

473 

474 >>> from picos.expressions import Samples 

475 >>> n, k = 7, 3 

476 >>> S = Samples(range(n)) 

477 >>> for i, (T, V) in enumerate(S.kfold(k)): 

478 ... print("Partition {}:\nT = {}V = {}" 

479 ... .format(i + 1, T.matrix, V.matrix)) 

480 Partition 1: 

481 T = [ 2.00e+00 3.00e+00 4.00e+00 5.00e+00 6.00e+00] 

482 V = [ 0.00e+00 1.00e+00] 

483 <BLANKLINE> 

484 Partition 2: 

485 T = [ 0.00e+00 1.00e+00 4.00e+00 5.00e+00 6.00e+00] 

486 V = [ 2.00e+00 3.00e+00] 

487 <BLANKLINE> 

488 Partition 3: 

489 T = [ 0.00e+00 1.00e+00 2.00e+00 3.00e+00 6.00e+00] 

490 V = [ 4.00e+00 5.00e+00] 

491 <BLANKLINE> 

492 

493 """ 

494 if not isinstance(k, int): 

495 raise TypeError("k must be an integer.") 

496 

497 if k < 2: 

498 raise ValueError("k must be at least two.") 

499 

500 if k > self.num: 

501 raise ValueError("k must not exceed the number of samples.") 

502 

503 n = self.num // k 

504 

505 assert n >= 1 and n < self.num 

506 

507 fold = [] 

508 indices = list(range(self.num)) 

509 

510 for i in range(k): 

511 t = indices[:i*n] + indices[(i+1)*n:] 

512 v = indices[i*n:(i+1)*n] 

513 

514 T = self.__class__.__new__(self.__class__) 

515 V = self.__class__.__new__(self.__class__) 

516 

517 if self._cached_cvx_mat is not None: 

518 T._cached_cvx_mat = self._cached_cvx_mat[:, t] 

519 V._cached_cvx_mat = self._cached_cvx_mat[:, v] 

520 

521 if self._cached_cvx_vec is not None: 

522 T._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in t) 

523 V._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in v) 

524 

525 if self._cached_pic_mat is not None: 

526 T._cached_pic_mat = self._cached_pic_mat[:, t] 

527 V._cached_pic_mat = self._cached_pic_mat[:, v] 

528 

529 if self._cached_pic_vec is not None: 

530 T._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in t) 

531 V._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in v) 

532 

533 fold.append((T, V)) 

534 

535 return fold 

536 

537 def select(self, indices): 

538 """Return a new :class:`Samples` instance with only selected samples. 

539 

540 :param indices: 

541 The indices of the samples to select. 

542 """ 

543 indices = list(indices) 

544 

545 S = self.__class__.__new__(self.__class__) 

546 

547 if self._cached_cvx_mat is not None: 

548 S._cached_cvx_mat = self._cached_cvx_mat[:, indices] 

549 

550 if self._cached_cvx_vec is not None: 

551 S._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in indices) 

552 

553 if self._cached_pic_mat is not None: 

554 S._cached_pic_mat = self._cached_pic_mat[:, indices] 

555 

556 if self._cached_pic_vec is not None: 

557 S._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in indices) 

558 

559 if len(S) == 0: 

560 raise ValueError("Empty susbet of samples selected.") 

561 

562 S._original_shape = self._original_shape 

563 return S 

564 

565 

566# -------------------------------------- 

567__all__ = api_end(_API_START, globals())