Coverage for picos/expressions/samples.py: 76.23%

244 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-04-12 07:53 +0000

1# ------------------------------------------------------------------------------ 

2# Copyright (C) 2020 Maximilian Stahlberg 

3# 

4# This file is part of PICOS. 

5# 

6# PICOS is free software: you can redistribute it and/or modify it under the 

7# terms of the GNU General Public License as published by the Free Software 

8# Foundation, either version 3 of the License, or (at your option) any later 

9# version. 

10# 

11# PICOS is distributed in the hope that it will be useful, but WITHOUT ANY 

12# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 

13# A PARTICULAR PURPOSE. See the GNU General Public License for more details. 

14# 

15# You should have received a copy of the GNU General Public License along with 

16# this program. If not, see <http://www.gnu.org/licenses/>. 

17# ------------------------------------------------------------------------------ 

18 

19"""Implements :class:`Samples`.""" 

20 

21import random 

22 

23import cvxopt 

24 

25from .. import glyphs 

26from ..apidoc import api_end, api_start 

27from ..caching import cached_property 

28from .data import cvxopt_hcat, load_data, load_shape 

29from .exp_affine import ComplexAffineExpression, Constant 

30from .expression import Expression 

31from ..legacy import throw_deprecation_warning 

32 

33_API_START = api_start(globals()) 

34# ------------------------------- 

35 

36 

37class Samples(): 

38 """A collection of data points. 

39 

40 :Example: 

41 

42 >>> from picos.expressions import Samples 

43 >>> # Load the column-major vectorization of six matrices. 

44 >>> data = [[[1*i, 3*i], 

45 ... [2*i, 4*i]] for i in range(1, 7)] 

46 >>> S = Samples(data) 

47 >>> S 

48 <Samples: (6 4-dimensional samples)> 

49 >>> [S.num, S.dim, S.original_shape] # Metadata. 

50 [6, 4, (2, 2)] 

51 >>> S.matrix # All samples as the columns of one matrix. 

52 <4×6 Real Constant: [4×6]> 

53 >>> print(S.matrix) 

54 [ 1.00e+00 2.00e+00 3.00e+00 4.00e+00 5.00e+00 6.00e+00] 

55 [ 2.00e+00 4.00e+00 6.00e+00 8.00e+00 1.00e+01 1.20e+01] 

56 [ 3.00e+00 6.00e+00 9.00e+00 1.20e+01 1.50e+01 1.80e+01] 

57 [ 4.00e+00 8.00e+00 1.20e+01 1.60e+01 2.00e+01 2.40e+01] 

58 >>> print(S[0].T) # The first sample (transposed for brevity). 

59 [ 1.00e+00 2.00e+00 3.00e+00 4.00e+00] 

60 >>> print(S.mean.T) # The sample mean (transposed for brevity). 

61 [ 3.50e+00 7.00e+00 1.05e+01 1.40e+01] 

62 >>> print(S.covariance) # The sample covariance matrix. 

63 [ 3.50e+00 7.00e+00 1.05e+01 1.40e+01] 

64 [ 7.00e+00 1.40e+01 2.10e+01 2.80e+01] 

65 [ 1.05e+01 2.10e+01 3.15e+01 4.20e+01] 

66 [ 1.40e+01 2.80e+01 4.20e+01 5.60e+01] 

67 >>> print(S.original[0]) # The first sample in its original shape. 

68 [ 1.00e+00 3.00e+00] 

69 [ 2.00e+00 4.00e+00] 

70 >>> U = S.select([0, 2, 4]) # Select a subset of samples by indices. 

71 >>> print(U.matrix) 

72 [ 1.00e+00 3.00e+00 5.00e+00] 

73 [ 2.00e+00 6.00e+00 1.00e+01] 

74 [ 3.00e+00 9.00e+00 1.50e+01] 

75 [ 4.00e+00 1.20e+01 2.00e+01] 

76 >>> T, V = S.partition() # Split into training and validation samples. 

77 >>> print(T.matrix) 

78 [ 1.00e+00 2.00e+00 3.00e+00] 

79 [ 2.00e+00 4.00e+00 6.00e+00] 

80 [ 3.00e+00 6.00e+00 9.00e+00] 

81 [ 4.00e+00 8.00e+00 1.20e+01] 

82 >>> print(V.matrix) 

83 [ 4.00e+00 5.00e+00 6.00e+00] 

84 [ 8.00e+00 1.00e+01 1.20e+01] 

85 [ 1.20e+01 1.50e+01 1.80e+01] 

86 [ 1.60e+01 2.00e+01 2.40e+01] 

87 """ 

88 

89 def __new__(cls, samples=None, forced_original_shape=None, **kwargs): 

90 """Prepare a :class:`Samples` instance.""" 

91 if isinstance(samples, cls): 

92 if forced_original_shape is not None: 

93 forced_shape = load_shape(forced_original_shape) 

94 if forced_shape[0]*forced_shape[1] != samples.dim: 

95 raise ValueError("Incompatible forced original shape.") 

96 

97 if forced_shape == samples.original_shape: 

98 # Shapes are consistent, return the existing instance. 

99 return samples 

100 else: 

101 # Make a shallow copy and change only the original shape. 

102 self = object.__new__(cls) 

103 self._cached_cvx_mat = samples._cached_cvx_mat 

104 self._cached_cvx_vec = samples._cached_cvx_vec 

105 self._cached_pic_mat = samples._cached_pic_mat 

106 self._cached_pic_vec = samples._cached_pic_vec 

107 self._original_shape = forced_shape 

108 return self 

109 else: 

110 # Return the existing instance. 

111 return samples 

112 else: 

113 # Return a new instance. 

114 self = object.__new__(cls) 

115 self._cached_cvx_mat = None 

116 self._cached_cvx_vec = None 

117 self._cached_pic_mat = None 

118 self._cached_pic_vec = None 

119 self._original_shape = None 

120 return self 

121 

122 def __init__(self, samples, forced_original_shape=None, always_copy=True): 

123 """Load a number of data points (samples). 

124 

125 :param samples: 

126 Any of the following: 

127 

128 - A tuple or list of constants, each of which denotes a sample 

129 vector. Matrices are vectorized but their :attr:`original_shape` 

130 is stored and may be used by PICOS internally. 

131 - A constant row or column vector whose entries denote scalar 

132 samples. 

133 - A constant matrix whose columns denote the samples. 

134 - Another :class:`Samples` instance. If possible, it is returned as 

135 is (:class:`Samples` instances are immutable), otherwise a shallow 

136 copy with the necessary modifications is returned instead. 

137 

138 In any case, constants may be given as constant numeric data values 

139 (anything recognized by :func:`~.data.load_data`) or as constant 

140 PICOS expressions. 

141 

142 :param forced_original_shape: 

143 Overwrites :attr:`original_shape` with the given shape. 

144 

145 :param bool always_copy: 

146 If this is :obj:`False`, then data that is provided in the form of 

147 CVXOPT types is not copied but referenced if possible. This can 

148 speed up instance creation but will introduce inconsistencies if the 

149 original data is modified. Note that this argument has no impact if 

150 the ``samples`` argument already is a :class:`Samples` instance; in 

151 this case data is never copied. 

152 """ 

153 if isinstance(samples, Samples): 

154 # Handled in __new__. 

155 return 

156 elif isinstance(samples, (tuple, list)): 

157 if not samples: 

158 raise ValueError("Need at least one sample.") 

159 

160 if all(isinstance(s, (int, float, complex)) for s in samples): 

161 # Efficiently handle a list of scalars. 

162 self._cached_cvx_mat = load_data(samples)[0].T 

163 elif all(isinstance(s, ComplexAffineExpression) 

164 and s.constant for s in samples): 

165 if len(set(s.shape for s in samples)) != 1: 

166 raise ValueError("Cannot load samples of differing shapes.") 

167 

168 self._original_shape = samples[0].shape 

169 self._cached_pic_vec = tuple(s.vec for s in samples) 

170 else: 

171 samples = tuple( 

172 load_data(s, alwaysCopy=always_copy)[0] for s in samples) 

173 

174 if len(set(s.size for s in samples)) != 1: 

175 raise ValueError("Cannot load samples of differing shapes.") 

176 

177 self._original_shape = samples[0].size 

178 self._cached_cvx_vec = tuple( 

179 s if s.size[1] == 1 else s[:] for s in samples) 

180 elif isinstance(samples, Expression): 

181 samples = samples.refined 

182 

183 if not isinstance(samples, ComplexAffineExpression): 

184 raise TypeError("Can only extract samples from a (constant) " 

185 "affine expression, not from an instance of {}." 

186 .format(type(samples).__name__)) 

187 

188 if not samples.constant: 

189 raise TypeError("Can only extract samples from a constant " 

190 "expression, {} is not constant.".format(samples.string)) 

191 

192 self._cached_pic_mat = samples 

193 

194 # Treat any vector as a number of scalar samples. 

195 if self._cached_pic_mat.shape[1] == 1: 

196 self._cached_pic_mat = self._cached_pic_mat.T 

197 else: 

198 self._cached_cvx_mat = load_data(samples, alwaysCopy=always_copy)[0] 

199 

200 # Treat any vector as a number of scalar samples. 

201 if self._cached_cvx_mat.size[1] == 1: 

202 self._cached_cvx_mat = self._cached_cvx_mat.T 

203 

204 assert any(samples is not None for samples in ( 

205 self._cached_cvx_vec, 

206 self._cached_pic_vec, 

207 self._cached_cvx_mat, 

208 self._cached_pic_mat)) 

209 

210 if forced_original_shape is not None: 

211 forced_shape = load_shape(forced_original_shape) 

212 if forced_shape[0]*forced_shape[1] != self.dim: 

213 raise ValueError("Incompatible forced original shape.") 

214 self._original_shape = forced_shape 

215 

216 def __len__(self): 

217 """Number of samples.""" 

218 return self.num 

219 

220 def __str__(self): 

221 return glyphs.parenth( 

222 "{} {}-dimensional samples".format(self.num, self.dim)) 

223 

224 def __repr__(self): 

225 return glyphs.repr2("Samples", self.__str__()) 

226 

227 def __getitem__(self, i): 

228 return self.vectors[i] 

229 

230 def __iter__(self): 

231 for vector in self.vectors: 

232 yield vector 

233 

234 @property 

235 def _cvxopt_matrix(self): 

236 """A CVXOPT dense or sparse matrix whose columns are the samples. 

237 

238 This cached property is used by PICOS internally as accessing the CVXOPT 

239 value of a constant PICOS expression would create a copy of the data. 

240 

241 .. warning:: 

242 

243 :class:`Sample` instances are supposed to be immutable, so you are 

244 expected not to modify the returned CVXOPT objects. 

245 """ 

246 if self._cached_cvx_mat is not None: 

247 pass 

248 elif self._cached_pic_mat is not None: 

249 self._cached_cvx_mat = self._cached_pic_mat.value_as_matrix 

250 elif self._cached_cvx_vec is not None: 

251 self._cached_cvx_mat = cvxopt_hcat(self._cached_cvx_vec) 

252 else: 

253 self._cached_cvx_mat = cvxopt_hcat( 

254 [s.value_as_matrix for s in self._cached_pic_vec]) 

255 

256 return self._cached_cvx_mat 

257 

258 @property 

259 def _cvxopt_vectors(self): 

260 """A :class:`tuple` containing the samples as CVXOPT column vectors. 

261 

262 This cached property is used by PICOS internally as accessing the CVXOPT 

263 value of a constant PICOS expression would create a copy of the data. 

264 

265 .. warning:: 

266 

267 :class:`Sample` instances are supposed to be immutable, so you are 

268 expected not to modify the returned CVXOPT objects. 

269 """ 

270 if self._cached_cvx_vec is not None: 

271 pass 

272 elif self._cached_cvx_mat is not None: 

273 self._cached_cvx_vec = tuple(self._cached_cvx_mat[:, i] 

274 for i in range(self._cached_cvx_mat.size[1])) 

275 elif self._cached_pic_vec is not None: 

276 self._cached_cvx_vec = tuple( 

277 s.value_as_matrix for s in self._cached_pic_vec) 

278 else: 

279 # We need to convert from a PICOS to a CVXOPT matrix, do so in a way 

280 # that caches the result. 

281 _ = self._cvxopt_matrix 

282 assert self._cached_cvx_mat is not None 

283 

284 self._cached_cvx_vec = tuple(self._cached_cvx_mat[:, i] 

285 for i in range(self._cached_cvx_mat.size[1])) 

286 

287 return self._cached_cvx_vec 

288 

289 @property 

290 def matrix(self): 

291 """A matrix whose columns are the samples.""" 

292 if self._cached_pic_mat is not None: 

293 pass 

294 else: 

295 self._cached_pic_mat = Constant(self._cvxopt_matrix) 

296 

297 return self._cached_pic_mat 

298 

299 @property 

300 def vectors(self): 

301 """A :class:`tuple` containing the samples as column vectors.""" 

302 if self._cached_pic_vec is not None: 

303 pass 

304 else: 

305 self._cached_pic_vec = tuple( 

306 Constant(s) for s in self._cvxopt_vectors) 

307 

308 return self._cached_pic_vec 

309 

310 @cached_property 

311 def original(self): 

312 """A :class:`tuple` containing the samples in their original shape.""" 

313 shape = self.original_shape 

314 

315 if shape[1] == 1: 

316 return self.vectors 

317 else: 

318 return tuple(sample.reshaped(shape) for sample in self) 

319 

320 @property 

321 def dim(self): 

322 """Sample dimension.""" 

323 if self._cached_cvx_mat is not None: 

324 return self._cached_cvx_mat.size[0] 

325 elif self._cached_pic_mat is not None: 

326 return self._cached_pic_mat.shape[0] 

327 elif self._cached_cvx_vec is not None: 

328 # NOTE: len() counts nonzero entries for sparse matrices. 

329 return self._cached_cvx_vec[0].size[0] 

330 else: 

331 return len(self._cached_pic_vec[0]) 

332 

333 @property 

334 def num(self): 

335 """Number of samples.""" 

336 if self._cached_cvx_mat is not None: 

337 return self._cached_cvx_mat.size[1] 

338 elif self._cached_pic_mat is not None: 

339 return self._cached_pic_mat.shape[1] 

340 elif self._cached_cvx_vec is not None: 

341 return len(self._cached_cvx_vec) 

342 else: 

343 return len(self._cached_pic_vec) 

344 

345 @property 

346 def original_shape(self): 

347 """Original shape of the samples before vectorization.""" 

348 if self._original_shape is None: 

349 self._original_shape = (self.dim, 1) 

350 

351 return self._original_shape 

352 

353 @cached_property 

354 def mean(self): 

355 """The sample mean as a column vector.""" 

356 return Constant(sum(self._cvxopt_vectors) / self.num) 

357 

358 @cached_property 

359 def covariance(self): 

360 """The sample covariance matrix.""" 

361 if self.num == 1: 

362 return cvxopt.spmatrix([], [], [], (1, 1)) 

363 

364 mu = self.mean.value_as_matrix 

365 X = self._cvxopt_matrix 

366 Y = mu*cvxopt.matrix(1, (1, self.num)) 

367 Z = X - Y 

368 

369 return Constant(Z * Z.T / (self.num - 1)) 

370 

371 def shuffled(self, rng=None): 

372 """Return a randomly shuffled instance of the samples. 

373 

374 :param rng: DEPRECATED 

375 

376 :Example: 

377 

378 >>> from picos.expressions import Samples 

379 >>> S = Samples(range(6)) 

380 >>> print(S.matrix) 

381 [ 0.00e+00 1.00e+00 2.00e+00 3.00e+00 4.00e+00 5.00e+00] 

382 >>> import random 

383 >>> random.seed(42) # Fix seed for reproducibility. 

384 >>> print(S.shuffled().matrix) 

385 [ 3.00e+00 1.00e+00 2.00e+00 4.00e+00 0.00e+00 5.00e+00] 

386 """ 

387 # Handle deprecated 'rng' parameter. 

388 if rng is not None: 

389 throw_deprecation_warning( 

390 "Argument 'rng' is deprecated and ignored.") 

391 

392 order = list(range(self.num)) 

393 random.shuffle(order) 

394 

395 S = self.__class__.__new__(self.__class__) 

396 

397 if self._cached_cvx_mat is not None: 

398 S._cached_cvx_mat = self._cached_cvx_mat[:, order] 

399 

400 if self._cached_cvx_vec is not None: 

401 S._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in order) 

402 

403 if self._cached_pic_mat is not None: 

404 # NOTE: Rename to a default string for consistency. 

405 S._cached_pic_mat = self._cached_pic_mat[:, order].renamed( 

406 glyphs.matrix(glyphs.shape(self._cached_pic_mat.shape))) 

407 

408 if self._cached_pic_vec is not None: 

409 S._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in order) 

410 

411 return S 

412 

413 def partition(self, after_or_fraction=0.5): 

414 """Split the samples into two parts. 

415 

416 :param after_or_fraction: 

417 Either a fraction strictly between zero and one that denotes the 

418 relative size of the first partition or an integer that denotes the 

419 number of samples to put in the first partition. 

420 :type after_or_fraction: 

421 int or float 

422 """ 

423 if isinstance(after_or_fraction, float): 

424 if after_or_fraction <= 0 or after_or_fraction >= 1: 

425 raise ValueError( 

426 "A partitioning fraction must be strictly between 0 and 1.") 

427 

428 n = round(self.num * after_or_fraction) 

429 n = min(n, self.num - 1) 

430 n = max(1, n) 

431 else: 

432 n = int(after_or_fraction) 

433 

434 if n < 1 or n >= self.num: 

435 raise ValueError("Partitioning would leave one partition empty.") 

436 

437 A = self.__class__.__new__(self.__class__) 

438 B = self.__class__.__new__(self.__class__) 

439 

440 if self._cached_cvx_mat is not None: 

441 A._cached_cvx_mat = self._cached_cvx_mat[:, :n] 

442 B._cached_cvx_mat = self._cached_cvx_mat[:, n:] 

443 

444 if self._cached_cvx_vec is not None: 

445 A._cached_cvx_vec = self._cached_cvx_vec[:n] 

446 B._cached_cvx_vec = self._cached_cvx_vec[n:] 

447 

448 if self._cached_pic_mat is not None: 

449 A._cached_pic_mat = self._cached_pic_mat[:, :n] 

450 B._cached_pic_mat = self._cached_pic_mat[:, n:] 

451 

452 if self._cached_pic_vec is not None: 

453 A._cached_pic_vec = self._cached_pic_vec[:n] 

454 B._cached_pic_vec = self._cached_pic_vec[n:] 

455 

456 A._original_shape = self._original_shape 

457 B._original_shape = self._original_shape 

458 

459 return A, B 

460 

461 def kfold(self, k): 

462 r"""Perform :math:`k`-fold cross-validation (without shuffling). 

463 

464 If random shuffling is desired, write ``S.shuffled().kfold(k)`` where 

465 ``S`` is your :class:`Samples` instance. To make the shuffling 

466 reproducible, see :meth:`shuffled`. 

467 

468 :returns list(tuple): 

469 A list of :math:`k` training set and validation set pairs. 

470 

471 .. warning:: 

472 

473 If the number of samples :math:`n` is not a multiple of :math:`k`, 

474 then the last :math:`n \bmod k` samples will appear in every 

475 training but in no validation set. 

476 

477 :Example: 

478 

479 >>> from picos.expressions import Samples 

480 >>> n, k = 7, 3 

481 >>> S = Samples(range(n)) 

482 >>> for i, (T, V) in enumerate(S.kfold(k)): 

483 ... print("Partition {}:\nT = {}V = {}" 

484 ... .format(i + 1, T.matrix, V.matrix)) 

485 Partition 1: 

486 T = [ 2.00e+00 3.00e+00 4.00e+00 5.00e+00 6.00e+00] 

487 V = [ 0.00e+00 1.00e+00] 

488 <BLANKLINE> 

489 Partition 2: 

490 T = [ 0.00e+00 1.00e+00 4.00e+00 5.00e+00 6.00e+00] 

491 V = [ 2.00e+00 3.00e+00] 

492 <BLANKLINE> 

493 Partition 3: 

494 T = [ 0.00e+00 1.00e+00 2.00e+00 3.00e+00 6.00e+00] 

495 V = [ 4.00e+00 5.00e+00] 

496 <BLANKLINE> 

497 

498 """ 

499 if not isinstance(k, int): 

500 raise TypeError("k must be an integer.") 

501 

502 if k < 2: 

503 raise ValueError("k must be at least two.") 

504 

505 if k > self.num: 

506 raise ValueError("k must not exceed the number of samples.") 

507 

508 n = self.num // k 

509 

510 assert n >= 1 and n < self.num 

511 

512 fold = [] 

513 indices = list(range(self.num)) 

514 

515 for i in range(k): 

516 t = indices[:i*n] + indices[(i+1)*n:] 

517 v = indices[i*n:(i+1)*n] 

518 

519 T = self.__class__.__new__(self.__class__) 

520 V = self.__class__.__new__(self.__class__) 

521 

522 if self._cached_cvx_mat is not None: 

523 T._cached_cvx_mat = self._cached_cvx_mat[:, t] 

524 V._cached_cvx_mat = self._cached_cvx_mat[:, v] 

525 

526 if self._cached_cvx_vec is not None: 

527 T._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in t) 

528 V._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in v) 

529 

530 if self._cached_pic_mat is not None: 

531 T._cached_pic_mat = self._cached_pic_mat[:, t] 

532 V._cached_pic_mat = self._cached_pic_mat[:, v] 

533 

534 if self._cached_pic_vec is not None: 

535 T._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in t) 

536 V._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in v) 

537 

538 fold.append((T, V)) 

539 

540 return fold 

541 

542 def select(self, indices): 

543 """Return a new :class:`Samples` instance with only selected samples. 

544 

545 :param indices: 

546 The indices of the samples to select. 

547 """ 

548 indices = list(indices) 

549 

550 S = self.__class__.__new__(self.__class__) 

551 

552 if self._cached_cvx_mat is not None: 

553 S._cached_cvx_mat = self._cached_cvx_mat[:, indices] 

554 

555 if self._cached_cvx_vec is not None: 

556 S._cached_cvx_vec = tuple(self._cached_cvx_vec[i] for i in indices) 

557 

558 if self._cached_pic_mat is not None: 

559 S._cached_pic_mat = self._cached_pic_mat[:, indices] 

560 

561 if self._cached_pic_vec is not None: 

562 S._cached_pic_vec = tuple(self._cached_pic_vec[i] for i in indices) 

563 

564 if len(S) == 0: 

565 raise ValueError("Empty susbet of samples selected.") 

566 

567 S._original_shape = self._original_shape 

568 return S 

569 

570 

571# -------------------------------------- 

572__all__ = api_end(_API_START, globals())