#!/usr/bin/env python3
# coding: utf-8
"""Chromosight's preprocessing submodule implements a number of functions to
operate on Hi-C contact maps before detection. These functions can be used to
improve the signal or filter unneeded signal. There are also functions to edit
(zoom, crop, factorize) kernel matrices.
"""
import sys
import numpy as np
import numpy.linalg as la
import scipy.stats as ss
import scipy.sparse as sp
import scipy.ndimage as ndi
from sklearn.isotonic import IsotonicRegression
def erase_missing(signal, valid_rows, valid_cols, sym_upper=True):
    """
    Given a sparse matrix, set all pixels in missing (invalid) bins to 0.

    Parameters
    ----------
    signal : scipy.sparse.csr_matrix of floats
        Input signal on which to erase values.
    valid_rows : numpy.ndarray of ints
        Indices of rows considered valid (not missing).
    valid_cols : numpy.ndarray of ints
        Indices of columns considered valid (not missing).
    sym_upper : bool
        Define if the input signal is upper symmetric.

    Returns
    -------
    scipy.sparse.csr_matrix
        The input signal with all values in missing bins set to 0
    """
    if sym_upper and sp.issparse(signal):
        if np.any(valid_rows != valid_cols):
            raise ValueError(
                "Valid rows and columns must be identical with sym_upper=True"
            )
        if signal.shape[0] != signal.shape[1]:
            raise ValueError(
                "Input matrix must be square when using sym_upper=True"
            )
        n_bins = signal.shape[0]
        # Boolean flags marking which bins are detectable
        keep = np.isin(np.arange(n_bins), valid_rows)
        # Sandwiching the signal between two diagonal masks zeroes out
        # every row and column whose diagonal entry is 0. E.g. to remove
        # the second bin (row and column):
        #   1 0 0     9 6 5     1 0 0     9 0 5
        #   0 0 0  X  6 8 7  X  0 0 0  =  0 0 0
        #   0 0 1     6 7 8     0 0 1     6 0 8
        diag_mask = sp.eye(n_bins)
        diag_mask.data[0][~keep] = 0
        result = diag_mask.dot(signal).dot(diag_mask)
    else:
        # Asymmetric case: translate valid indices into missing ones and
        # blank the corresponding rows/columns on a copy of the input
        bad_rows = valid_to_missing(valid_rows, signal.shape[0])
        bad_cols = valid_to_missing(valid_cols, signal.shape[1])
        result = signal.copy()
        result[bad_rows, :] = 0
        result[:, bad_cols] = 0
    return result
def set_mat_diag(mat, diag=0, val=0):
    """
    Set the nth diagonal of a symmetric 2D numpy array to a fixed value.
    Operates in place.

    Parameters
    ----------
    mat : numpy.ndarray
        Symmetric 2D array of floats.
    diag : int
        0-based index of the diagonal to modify. Use negative values for the
        lower half.
    val : float
        Value to use for filling the diagonal
    """
    m = mat.shape[0]
    # Consecutive elements of any diagonal are m + 1 apart in the flat array
    step = m + 1
    if diag >= 0:
        # Upper diagonal d starts at flat index d; it holds m - d elements,
        # so it ends before flat index m**2 - d*m.
        start = diag
        end = m * m - diag * m
    else:
        # BUG FIX: the previous formula (start=diag, end=m**2 - diag*m)
        # addressed the wrong cells for negative diagonals (e.g. m=3,
        # diag=-1 modified only (2,2)). Lower diagonal -d starts at
        # flat index d*m and runs to the end of the array.
        start = -diag * m
        end = m * m
    mat.flat[start:end:step] = val
def diag_trim(mat, n):
    """
    Trim an upper triangle sparse matrix so that only the first n diagonals
    are kept.

    Parameters
    ----------
    mat : scipy.sparse.csr_matrix or numpy.ndarray
        The sparse matrix to be trimmed
    n : int
        The number of diagonals from the center to keep (0-based).

    Returns
    -------
    scipy.sparse.csr_matrix or numpy.ndarray :
        The diagonally trimmed upper triangle matrix with only the first
        n diagonals. (Doc fix: the sparse result is CSR, not DIA.)
    """
    if sp.issparse(mat):
        if mat.format != "csr":
            raise ValueError("input type must be scipy.sparse.csr_matrix")
        # Keep everything at or below the nth super-diagonal, then drop
        # everything below the main diagonal: an upper band of width n.
        trimmed = sp.tril(mat, n, format="csr")
        trimmed = sp.triu(trimmed, format="csr")
    else:
        # Dense case: zero out each diagonal beyond the nth one, in place
        # on a copy of the input.
        trimmed = mat.copy()
        n_diags = trimmed.shape[0]
        for diag in range(n, n_diags):
            set_mat_diag(trimmed, diag, 0)
    # Single exit point; the original had a redundant duplicate return.
    return trimmed
def distance_law(
    matrix, detectable_bins=None, max_dist=None, smooth=True, fun=np.nanmean
):
    """
    Computes genomic distance law by averaging over each diagonal in the upper
    triangle matrix. If a list of detectable bins is provided, pixels in
    missing bins will be excluded from the averages. A maximum distance can be
    specified to define how many diagonals should be computed.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        The input matrix to compute distance law from.
    detectable_bins : numpy.ndarray of ints
        An array of detectable bins indices to consider when computing
        distance law.
    max_dist : int
        Maximum distance from diagonal, in number of bins in which to compute
        distance law
    smooth : bool
        Whether to use isotonic regression to smooth the distance law.
    fun : callable
        A function to apply on each diagonal. Defaults to mean.

    Returns
    -------
    dist : numpy.ndarray
        The output genomic distance law.

    Example
    -------
    >>> m = np.ones((3,3))
    >>> m += np.array([1,2,3])
    >>> distance_law(sp.csr_matrix(m), smooth=False)
    array([3. , 3.5, 4. ])
    """
    mat_n = matrix.shape[0]
    if max_dist is None:
        max_dist = mat_n
    n_diags = min(mat_n, max_dist + 1)
    dist = np.zeros(mat_n)
    if detectable_bins is None:
        detectable_bins = np.array(range(mat_n))
    # Hoisted out of the loop: the detectable-bin mask does not depend on
    # the diagonal being processed (it was rebuilt at every iteration).
    detect_mask = np.zeros(mat_n, dtype=bool)
    detect_mask[detectable_bins] = 1
    for diag in range(n_diags):
        # A pixel at offset i on diagonal d involves bins i (row) and
        # i + d (col); keep it only when both bins are detectable.
        # (Simplified: detect_mask[mat_n - (mat_n - diag):] == detect_mask[diag:])
        detect_mask_h = detect_mask[: (mat_n - diag)]
        detect_mask_v = detect_mask[diag:]
        detect_mask_diag = detect_mask_h & detect_mask_v
        detect_diag = matrix.diagonal(diag)[detect_mask_diag]
        # NOTE: yields NaN (with a runtime warning) when no nonzero
        # detectable pixel exists on the diagonal, as before.
        dist[diag] = fun(detect_diag[detect_diag > 0])
    # Smooth the curve using isotonic regression: Find closest approximation
    # with the condition that point n+1 cannot be higher than point n.
    # (i.e. contacts can only decrease when increasing distance)
    if smooth and mat_n > 2:
        ir = IsotonicRegression(increasing=False)
        dist[~np.isfinite(dist)] = 0
        dist = ir.fit_transform(range(len(dist)), dist)
    return dist
def get_detectable_bins(mat, n_mads=3, inter=False):
    """
    Returns lists of detectable indices after excluding low interacting bins
    based on the proportion of zero pixel values in the matrix bins.

    Parameters
    ----------
    mat : scipy.sparse.coo_matrix
        A Hi-C matrix in the form of a 2D numpy array or coo matrix
    n_mads : int
        Number of median absolute deviation below the median required to
        consider bins non-detectable.
    inter : bool
        Whether the matrix is interchromosomal. Default is to consider the
        matrix is intrachromosomal (i.e. upper symmetric).

    Returns
    -------
    numpy ndarray :
        tuple of 2 1D arrays containing indices of low interacting rows and
        columns, respectively.
    """
    bin_mat = mat.copy()
    bin_mat.eliminate_zeros()

    def mad(values):
        return ss.median_abs_deviation(values, nan_policy="omit")

    if inter:
        # Asymmetric (interchromosomal) case: rows and columns are
        # filtered independently on their raw contact sums.
        sum_rows = bin_mat.sum(axis=1).A1
        sum_cols = bin_mat.sum(axis=0).A1
        cutoff_rows = max(1, np.median(sum_rows) - n_mads * mad(sum_rows))
        cutoff_cols = max(1, np.median(sum_cols) - n_mads * mad(sum_cols))
        good_bins = (
            np.flatnonzero(sum_rows > cutoff_rows),
            np.flatnonzero(sum_cols > cutoff_cols),
        )
    else:
        if bin_mat.shape[0] != bin_mat.shape[1]:
            raise ValueError("Intrachromosomal matrices must be symmetric.")
        # Binarise pixels so bin sums count nonzero pixels instead of
        # contact weights
        bin_mat.data = np.ones(bin_mat.data.shape)
        nnz_per_bin = sum_mat_bins(bin_mat)
        # Bins further than n_mads MADs below the median are flagged
        cutoff = max(1, np.median(nnz_per_bin) - n_mads * mad(nnz_per_bin))
        detectable = np.flatnonzero(nnz_per_bin >= cutoff)
        # Symmetric matrix: rows and columns are the same set
        good_bins = (detectable, detectable)
    return good_bins
def detrend(
    matrix,
    detectable_bins=None,
    max_dist=None,
    smooth=False,
    fun=np.nanmean,
    max_val=10,
):
    """
    Detrends a Hi-C matrix by the distance law.
    The input matrix should have been normalised beforehand.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        The normalised intrachromosomal Hi-C matrix to detrend.
    detectable_bins : tuple
        Tuple containing a list of detectable rows and a list of columns on
        which to perform detrending. Poorly interacting indices have been
        excluded.
    max_dist : int
        Maximum number of bins from the diagonal at which to compute trend.
    smooth : bool
        Whether to use isotonic regression to smooth the trend.
    fun : callable
        Function to use on each diagonal to compute the trend.
    max_val : float or None
        Maximum value in the detrended matrix. Set to None to disable

    Returns
    -------
    numpy.ndarray :
        The detrended Hi-C matrix.
    """
    matrix = matrix.tocsr()
    # Expected contact value at each genomic distance
    trend = distance_law(
        matrix,
        detectable_bins=detectable_bins,
        max_dist=max_dist,
        smooth=smooth,
        fun=fun,
    )
    trend[np.isnan(trend)] = 0.0
    # COO format exposes row/col index arrays, making the per-pixel
    # diagonal distance cheap to compute
    detrended = matrix.tocoo()
    try:
        # Divide each pixel by the expected value at its distance
        distances = abs(detrended.row - detrended.col)
        detrended.data = detrended.data / trend[distances]
    # If no nonzero value in matrix, do nothing
    except TypeError:
        pass
    detrended = detrended.tocsr()
    if max_val is not None:
        # Clip unusually high ratios back to the background value of 1
        detrended[detrended >= max_val] = 1
    return detrended
def sum_mat_bins(mat):
    """
    Compute the sum of matrix bins (i.e. rows or columns) using
    only the upper triangle, assuming symmetrical matrices.

    Parameters
    ----------
    mat : scipy.sparse.coo_matrix
        Contact map in sparse format, either in upper triangle or
        full matrix.

    Returns
    -------
    numpy.ndarray :
        1D array of bin sums.
    """
    # Row sums plus column sums count the main diagonal twice; subtracting
    # it once makes the result equivalent to a row (or column) sum on a
    # full symmetric matrix. A1 flattens the np.matrix returned by .sum().
    row_sums = mat.sum(axis=1).A1
    col_sums = mat.sum(axis=0).A1
    return row_sums + col_sums - mat.diagonal(0)
def frame_missing_mask(mask, kernel_shape, sym_upper=False, max_dist=None):
    """
    Adds a frame around input mask, given a kernel. The goal of this
    frame is define margins around the matrix where the kernel will not perform
    convolution (denoted by 1). If the matrix is upper symmetric, a margin of
    half the kernel's width is added below the diagonal and a maximum distance
    from the diagonal above which margins need not be drawn can be considered.
    Otherwise Margins are simply added on all 4 sides of the matrix.
    ::

        signal    kernel    _________
        ______     ____    |#######|
       |      |   |    | ==> |#     #|
       |      |   |____|     |#     #|
       |      |              |#     #|
       |______|              |#     #|
                             |#######|

    Parameters
    ----------
    mask : scipy.sparse.csr_matrix of bool
        The mask around which to add margins.
    kernel_shape : tuple of ints
        The number of rows and columns in the input kernel. Margins will be
        half these values.
    sym_upper : bool
        Whether the signal is a symmetric upper triangle matrix. If so, values
        on a margin below the diagonal will be masked.
    max_dist : int or None
        Number of diagonals to keep

    Returns
    -------
    framed_mask : scipy.sparse.csr_matrix of bool
        The input mask with a padding of True around the edges. If sym_upper
        is True, a padding is also added below the diagonal.
    """
    if mask.dtype != bool:
        raise ValueError("Mask must contain boolean values")
    if not sp.issparse(mask):
        raise ValueError("Mask must be a sparse matrix")
    framed_mask = mask.copy()
    ms, ns = mask.shape
    mk, nk = kernel_shape
    if sym_upper and (max_dist is not None):
        # Remove diagonals further than scan distance in the input mask
        framed_mask = diag_trim(framed_mask, max_dist + max(nk, mk)).tocsr()
        # Within the scan band, margins only need to reach max_dist plus
        # the kernel size along each axis
        max_m = max_dist + mk
        max_n = max_dist + nk
    else:
        max_m, max_n = ms, ns
    # Up and down margins initialized with zeros and filled as needed
    margin_1 = sp.csr_matrix((mk - 1, ns), dtype=bool)
    margin_2 = sp.csr_matrix((mk - 1, ns), dtype=bool)
    if sym_upper and (max_dist is not None):
        # Margin 1 (top) is in upper triangle -> fill missing up to scan dist
        # NOTE(review): margin_2 (bottom) is left empty here — presumably
        # covered by the sub-diagonal fill below; confirm against callers.
        margin_1[:, :max_n] = 1
    else:
        margin_1[:, :] = 1
        margin_2[:, :] = 1
    framed_mask = sp.vstack([margin_1, framed_mask, margin_2], format="csr")
    # Left and right margins; height accounts for the rows just stacked
    margin_1 = sp.csr_matrix((ms + 2 * (mk - 1), nk - 1), dtype=bool)
    margin_2 = sp.csr_matrix((ms + 2 * (mk - 1), nk - 1), dtype=bool)
    if sym_upper and (max_dist is not None):
        # Margin 2 (right) is in upper triangle-> fill missing up to scan dist
        margin_2[-(max_m + 1) :, :] = 1
        # Fill only the start of left margin for the top-left corner
        margin_1[: mk - 1, :] = 1
    else:
        margin_1[:, :] = 1
        margin_2[:, :] = 1
    framed_mask = sp.hstack([margin_1, framed_mask, margin_2], format="csr")
    if sym_upper:
        # LIL format is much faster when changing sparsity
        framed_mask = framed_mask.tolil()
        # Add margin below diagonal: flag the first max(nk, mk) diagonals
        # under the main diagonal
        big_k = max(nk, mk)
        dia_margins = np.ones(big_k)
        dia_offsets = np.arange(-1, -big_k - 1, -1)
        framed_mask += sp.diags(
            dia_margins,
            dia_offsets,
            shape=framed_mask.shape,
            format="lil",
            dtype=bool,
        )
        framed_mask = framed_mask.tocsr()
    return framed_mask
def check_missing_mask(signal, mask):
    """
    Ensure all elements defined as missing by the mask are set to zero in the
    signal. If this is not the case, raises an error.

    Parameters
    ----------
    signal : numpy.ndarray of floats or scipy.sparse.csr_matrix of floats
        The signal to be checked.
    mask : numpy.ndarray of bools or scipy.sparse.csr_matrix of bools
        The mask defining missing values as True and valid values as False.

    Raises
    ------
    ValueError
        If any pixel flagged as missing by the mask carries nonzero signal.
    """
    if sp.issparse(mask):
        # Check if there are nonzero values in the signal reported as missing
        # by the mask
        miss_rows, miss_cols = mask.nonzero()
        missing_with_signal = np.nonzero(
            abs(signal[miss_rows, miss_cols]) > 0
        )[0]
        if len(missing_with_signal) > 0:
            # BUG FIX: the message was previously passed as multiple
            # arguments to ValueError, producing a tuple instead of a
            # readable string.
            raise ValueError(
                f"There are {len(missing_with_signal)} non-zero elements "
                "reported as missing."
            )
    else:
        # Dense case: total absolute signal in missing positions must be
        # (numerically) zero
        total_missing_signal = np.sum(abs(signal[mask > 0]))
        if total_missing_signal > 1e-10:
            raise ValueError(
                f"There are {total_missing_signal} non-zero elements "
                "reported as missing."
            )
def make_missing_mask(
    shape, valid_rows, valid_cols, max_dist=None, sym_upper=False
):
    """
    Given lists of valid rows and columns, generate a sparse matrix mask with
    missing pixels denoted as 1 and valid pixels as 0. If a max_dist is
    provided, upper symmetric matrices will only be flagged up to max_dist
    pixels from the diagonal.

    Parameters
    ----------
    shape : tuple of ints
        Shape of the mask to generate.
    valid_rows : numpy.ndarray of ints
        Array with the indices of valid rows that should be set to 0 in the
        mask.
    valid_cols : numpy.ndarray of ints
        Array with the indices of valid columns that should be set to 0 in
        the mask.
    max_dist : int or None
        The maximum diagonal distance at which masking should take place.
        Only taken into account when sym_upper is True.
    sym_upper : bool
        Whether the matrix is symmetric upper.

    Returns
    -------
    scipy.sparse.csr_matrix of bool
        The mask containing False values where pixels are valid and True
        values where pixels are missing
    """
    # Error if the matrix upper symmetric but shape is rectangle or missing
    # rows and cols are different
    sm, sn = shape
    if sym_upper and (sm != sn or len(valid_rows) != len(valid_cols)):
        raise ValueError("Rectangular matrices cannot be upper symmetric")
    # Get a boolean array of missing (1) and valid (0) rows
    missing_rows = valid_to_missing(valid_rows, sm)
    # When matrix is sym., rows and cols are synonym, no need to compute 2x
    if sym_upper:
        missing_cols = missing_rows
    else:
        missing_cols = valid_to_missing(valid_cols, sn)
    # If upper sym., fill only upper diag up to max_dist.
    # E. g. with bins 1 and 3 missing
    # and a max_dist of 1:
    # 0 1 0 0 0
    # 0 1 1 0 0
    # 0 0 0 1 0
    # 0 0 0 1 1
    # 0 0 0 0 0
    # For each missing bin, mask is apply 1 pixel upwards and 1 to the right
    # to fill only the upper triangle up to max_dist
    if sym_upper:
        # If no max dist has been specified, fill the whole upper triangle
        if max_dist is None:
            max_dist = min(shape)
        # Generate matrix of distance shifts by row. After transposition,
        # shape is (max_dist + 1) x len(missing_rows).
        # e.g.: 2 missing rows and max dist of 1
        # 0 0
        # 1 1
        row_shifts = np.tile(
            np.array(range(max_dist + 1)), (len(missing_rows), 1)
        ).T
        # Compute row positions upwards to diagonal by subtracting missing rows
        # to the shifts. Following the previous example, if missing rows are
        # bins 1 and 3:
        # 1 3
        # 0 2
        rows_before = (missing_rows - row_shifts).flatten("F")
        # looking at pixels up from the bins, cols remain the same:
        # 1 3
        # 1 3
        cols_before = np.repeat(missing_rows, max_dist + 1)
        # Compute col position to the right until diagonal by adding the shift
        # Note: upper symmetric, so row_shifts = col_shifts
        # 1 3
        # 2 4
        cols_after = (missing_cols + row_shifts).flatten("F")
        # This time, rows remain constant since we are computing positions to
        # the right
        rows_after = np.repeat(missing_cols, max_dist + 1)
        # Combine positions to the right and upwards
        rows = np.concatenate([rows_before, rows_after])
        cols = np.concatenate([cols_before, cols_after])
        data = np.ones(rows.shape, dtype=bool)
        # Remove entries where rows or cols are negative or larger than shape
        # NOTE(review): both axes are bounded by sm; safe here only because
        # this branch enforces a square matrix (sm == sn) above.
        valid = (cols < sm) & (cols >= 0) & (rows < sm) & (rows >= 0)
        # Build mask mat with miss bins up to max scan dist in upper triangle
        mask = sp.coo_matrix(
            (data[valid], (rows[valid], cols[valid])), shape=shape, dtype=bool
        ).tocsr()
    else:
        # Asymmetric case: flag whole missing rows and columns
        mask = sp.csr_matrix(shape, dtype=bool)
        mask[missing_rows, :] = 1
        mask[:, missing_cols] = 1
    return mask
def zero_pad_sparse(mat, margin_h, margin_v, fmt="coo"):
    """
    Adds a margin of zeros around an input sparse matrix.

    Parameters
    ----------
    mat : scipy.sparse.csr_matrix
        The matrix to be padded.
    margin_h : int
        The width of the horizontal margin to add on the left and right of the
        matrix.
    margin_v : int
        The width of the vertical margin to add on the top and bottom of the
        matrix.
    fmt : string
        The desired scipy sparse format of the output matrix.
        NOTE(review): currently ignored; the output is always CSR.

    Returns
    -------
    scipy.sparse.csr_matrix :
        The padded matrix of dimensions (m + 2 * margin_v, n + 2 * margin_h).
        (Doc fix: the previous docstring swapped the two margin terms; the
        doctest below shows a 2x2 input with margin_h=2, margin_v=1 yielding
        a 4x6 output.)

    Examples
    --------
    >>> m = sp.csr_matrix(np.array([[1, 2], [10, 20]]))
    >>> zero_pad_sparse(m, 2, 1).toarray()
    array([[ 0,  0,  0,  0,  0,  0],
           [ 0,  0,  1,  2,  0,  0],
           [ 0,  0, 10, 20,  0,  0],
           [ 0,  0,  0,  0,  0,  0]])
    """
    sm, sn = mat.shape
    # Horizontal margins are stacked left and right first, so the vertical
    # margins must span the already-widened matrix.
    margin_h_0 = sp.csr_matrix((sm, margin_h), dtype=mat.dtype)
    margin_v_0 = sp.csr_matrix((margin_v, sn + 2 * margin_h), dtype=mat.dtype)
    padded_mat = sp.hstack([margin_h_0, mat, margin_h_0], format="csr")
    padded_mat = sp.vstack([margin_v_0, padded_mat, margin_v_0], format="csr")
    return padded_mat
def crop_kernel(kernel, target_size):
    """
    Crop a kernel matrix to target size horizontally and vertically.
    If the target size is even, the target size is adjusted to the
    next integer up.

    Parameters
    ----------
    kernel : numpy.ndarray of floats
        Image to crop.
    target_size : tuple of ints
        Tuple defining the target shape of the kernel, takes the
        form (rows, cols) where rows and cols are odd numbers.

    Returns
    -------
    cropped : numpy.ndarray of floats
        New image no larger than target dimensions
    """
    # Bump any even target dimension to the next odd value
    target = list(target_size)
    adjusted = False
    for axis, size in enumerate(target):
        if size % 2 == 0:
            target[axis] = size + 1
            adjusted = True
    if adjusted:
        sys.stderr.write(
            "WARNING: Cropped kernel size adjusted to "
            f"{target[0]}x{target[1]} to keep odd dimensions.\n"
        )
    source_m, source_n = kernel.shape
    target_m, target_n = target
    # Trim an equal margin from each side; no trimming when the kernel is
    # already at or below the target size along that axis
    margin_rows = max(0, (source_m - target_m) // 2)
    margin_cols = max(0, (source_n - target_n) // 2)
    return kernel[
        margin_rows : source_m - margin_rows,
        margin_cols : source_n - margin_cols,
    ]
def resize_kernel(
    kernel,
    kernel_res=None,
    signal_res=None,
    factor=None,
    min_size=7,
    quiet=False,
):
    """
    Resize a kernel matrix based on the resolution at which it was defined and
    the signal resolution. E.g. if a kernel matrix was generated for 10kb and
    the input signal is 20kb, kernel size will be divided by two. If the kernel
    is enlarged, pixels are interpolated with a spline of degree 1.
    Alternatively, a resize factor can be provided. In the example above, the
    factor would be 0.5.

    Parameters
    ----------
    kernel : numpy.ndarray
        Kernel matrix.
    kernel_res : int
        Resolution for which the kernel was designed. Mutually exclusive with
        factor.
    signal_res : int
        Resolution of the signal matrix in basepair per matrix bin. Mutually
        exclusive with factor.
    factor : float
        Resize factor. Can be provided as an alternative to kernel_res and
        signal_res. Values above 1 will enlarge the kernel, values below 1
        will shrink it.
    min_size : int
        Lower bound, in number of rows/column allowed when resizing the
        kernel.
    quiet : bool
        Suppress warnings if resize factor was adjusted.

    Returns
    -------
    resized_kernel : numpy.ndarray
        The resized input kernel.
    """
    km, kn = kernel.shape
    if km != kn:
        raise ValueError("kernel must be square.")
    if km % 2 == 0 or kn % 2 == 0:
        raise ValueError("kernel size must be odd.")
    # Resolve the zoom factor from either `factor` or the two resolutions;
    # the two ways of specifying it are mutually exclusive
    if factor is None:
        if kernel_res is None or signal_res is None:
            raise ValueError(
                "You must provide either a resize factor or the signal and "
                "kernel resolutions."
            )
        # How many times the kernel must be enlarged for its pixels to
        # match the signal's pixels
        resize_factor = kernel_res / signal_res
    else:
        if kernel_res is not None or signal_res is not None:
            raise ValueError(
                "factor is mutually exclusive with resolution "
                "parameters (kernel_res and signal_res)."
            )
        resize_factor = factor
    # Never shrink below min_size rows/columns
    if km * resize_factor < min_size:
        resize_factor = min_size / km
    resized_kernel = ndi.zoom(kernel, resize_factor, order=1)
    if resized_kernel.shape[0] % 2 == 0:
        # Force odd output dimensions by targeting one row/column less
        adj_resize_factor = (resized_kernel.shape[0] - 1) / km
        if not quiet:
            sys.stderr.write(
                f"Adjusting resize factor from {resize_factor} to {adj_resize_factor}.\n"
            )
        resized_kernel = ndi.zoom(kernel, adj_resize_factor, order=1)
    return resized_kernel
def factorise_kernel(kernel, prop_info=0.999):
    """
    Performs truncated SVD on an input kernel, returning the singular vectors
    necessary to retain a given proportion of information contained in the
    kernel.

    Parameters
    ----------
    kernel : numpy.ndarray of floats
        The input 2D kernel to factorise.
    prop_info : float
        Proportion of information to retain.

    Returns
    -------
    tuple of numpy.ndarrays of floats
        A tuple containing the truncated left and right singular matrices,
        where each singular vector has been multiplied by the square root of
        their respective singular values.
    """
    u, sigma, v = la.svd(kernel)
    # Information content is measured as squared singular values
    total_info = np.sum(sigma ** 2)
    # Smallest number of leading singular vectors retaining enough info
    keep_k = (
        np.flatnonzero(np.cumsum(sigma ** 2) > prop_info * total_info)[0] + 1
    )
    if keep_k > np.floor(min(kernel.shape) / 2):
        # BUG FIX: the warning message had a misplaced comma
        # ("...singular,vectors this may result...")
        sys.stderr.write(
            f"Warning: Kernel factorisation required {keep_k} singular "
            "vectors, this may result in slow operations.\n"
        )
    # Truncate to the required vectors and fold sqrt(singular value) into
    # each vector so that u @ v approximates the kernel (vectorized, was a
    # per-vector Python loop).
    u = u[:, :keep_k] * np.sqrt(sigma[:keep_k])
    v = v[:keep_k, :] * np.sqrt(sigma[:keep_k])[:, None]
    return (u, v)
def valid_to_missing(valid, size):
    """
    Given an array of valid indices, return the corresponding array of
    missing indices.

    Parameters
    ----------
    valid : numpy.ndarray of ints
        The valid indices.
    size : int
        The size of the matrix (maximum possible index + 1).

    Returns
    -------
    missing : numpy.ndarray of ints
        The missing indices.
    """
    # Start with every index flagged as missing, then clear the valid ones
    missing_flags = np.ones(size, dtype=bool)
    try:
        missing_flags[valid] = False
    except IndexError:
        # An empty float array (e.g. np.array([])) cannot be used as an
        # integer index; treat it as "no valid index" and keep everything
        # flagged missing.
        pass
    return np.flatnonzero(missing_flags)