001    /*
002     * Copyright (c) 1998-2014 ChemAxon Ltd. All Rights Reserved.
003     *
004     * This software is the confidential and proprietary information of
005     * ChemAxon. You shall not disclose such Confidential Information
006     * and shall use it only in accordance with the terms of the agreements
007     * you entered into with ChemAxon.
008     *
009     */
010    
011    package com.chemaxon.clustering.sphex;
012    
013    import chemaxon.license.LicenseHandler;
014    import chemaxon.license.LicenseManager;
015    import com.chemaxon.calculations.common.ProgressObserver;
016    import com.chemaxon.calculations.common.SubProgressObserver;
017    import com.chemaxon.clustering.common.DissimilarityInput;
018    import com.chemaxon.clustering.common.IDBasedClusterBuilder;
019    import com.chemaxon.clustering.common.IDBasedSingleLevelClustering;
020    import com.chemaxon.common.annotations.PublicAPI;
021    import com.google.common.annotations.Beta;
022    import com.google.common.base.Optional;
023    import java.util.ArrayList;
024    import java.util.Arrays;
025    import java.util.List;
026    import java.util.Random;
027    import java.util.concurrent.CancellationException;
028    import org.apache.commons.logging.Log;
029    import org.apache.commons.logging.LogFactory;
030    
031    /**
032     * Core functionality related to sphere exclusion clustering.
033     *
034     * <p>Licensing: this class is part of the JKlustor suite; it can be used with valid
035     * {@link LicenseManager#JKLUSTOR JKlustor} license.</p>
036     *
037     * <p>Please note that this class is marked with @Beta annotation, so it can be subject of incompatible changes or
038     * removal in later releases.</p>
039     *
040     * @author Gabor Imre
041     * @author Adrian Kalaszi
042     */
043    // TODO: consider the max possible radius through dissimilarityInput
044    // TODO: sample median based initial guess (calculate full inner ditances of a sample, consider median value
045    // TODO: hierarchic indexing to speed up nearest centroid location
046    
047    @Beta
048    @PublicAPI
049    public final class SphereExclusion {
050    
051        /**
052         * Logger to use
053         */
054        private static final Log LOG = LogFactory.getLog(SphereExclusion.class);
055    
056        /**
057         * No constructor for utility classes.
058         */
059        private SphereExclusion() {
060        }
061    
062        /**
063         * Initial guess on sphere radius
064         *
065         * @author Adrian Kalaszi
066         */
067        private interface InitialGuess {
068            /**
069             * Guess a sphere exclusion radius based on a {@link DissimilarityInput}.
070             *
071             * @param di    Input space
072             * @return      Initial sphex readius, note that it can be zero
073             * @throws      IllegalArgumentException when less than 2 elements contained
074             */
075            double guess(DissimilarityInput di);
076        }
077    
078        /**
079         * Guess is the first two elements distance.
080         */
081        private static final class FirstTwoElements implements InitialGuess {
082    
083            @Override
084            public double guess(DissimilarityInput di) {
085                if (di.size() < 2) {
086                    throw new IllegalArgumentException("At least two element expected, got " + di.size());
087                }
088                return di.dissimilarity(0, 1);
089            }
090    
091        }
092    
093        /**
094         * Guess is the distance of two random elements.
095         */
096        private static final class TwoRandomElements implements InitialGuess {
097    
098            @Override
099            public double guess(DissimilarityInput di) {
100                if (di.size() < 2) {
101                    throw new IllegalArgumentException("At least two element expected, got " + di.size());
102                }
103                final Random r = new Random();
104                final int first = r.nextInt(di.size());
105                int second = first;
106                while (second == first) {
107                    second = r.nextInt(di.size());
108                }
109                return di.dissimilarity(first, second);
110            }
111    
112        }
113    
114        /**
115         * Sphere exclusion clustering with defined sphere radius.
116         *
117         * <p>The returned clustring will reflect the given sphere radius, regardless of the found cluster count. If finer
118         * control is required over maximal cluster count then do centroid identification and classification steps
119         * separately.</p>
120         *
121         * @param radius       Sphere radius
122         * @param di           input source
123         * @param po           progress observer to update
124         *
125         * @return the clustering result
126         *
127         * @throws CancellationException if cancelled by the progress observer
128         * @throws IllegalArgumentException on empty input or non positive radius
129         * @throws chemaxon.license.LicenseException when appropriate license is not available
130         */
131        public static IDBasedSingleLevelClustering fixedSPHEX(
132                double radius, DissimilarityInput di, ProgressObserver po) throws CancellationException {
133    
134            // Check licenses
135            LicenseHandler.getInstance().checkLicense(LicenseManager.JKLUSTOR);
136    
137            if (radius <= 0.0) {
138                throw new IllegalArgumentException("Positive radius expected, got " + radius);
139            }
140    
141            if (di.size() == 0) {
142                throw new IllegalArgumentException("Empty input.");
143            }
144    
145            final SubProgressObserver spo1 = po.subTask("Identify centroids", 1);
146            final int [] centroids;
147            try {
148                centroids = sphexCentroids(radius, Integer.MAX_VALUE, di, spo1).get();
149            } finally {
150                spo1.done();
151            }
152    
153            final SubProgressObserver spo2 = po.subTask("Classify input", 1);
154            try {
155                return sphexClassify(centroids, di, spo2);
156            } finally {
157                spo2.done();
158            }
159    
160        }
161    
162        /**
163         * Sphere exclusion clustering with sphere radius inference.
164         *
165         * <p>The returned cluster count is expected between the minimum and maximum expected value. This is aimed to
166         * be reached through optimization of sphere exclusion radius value. If minimum cluster count is greater than
167         * the represented input count its value will be lowered. Please note that it can not be guaranteed that
168         * resulting cluster count will fall between the given (or updated) limits.</p>
169         *
170         * @param initialMinCC minimum expected cluster count inclusive
171         * @param maxCC        maximum expected cluster count inclusive
172         * @param di           input source
173         * @param po           progress observer to update
174         *
175         * @return Input item ID based clustering; using centroids as representants. Note that the resulting cluster count
176         *         can fall outside the expected cluster count range.
177         *
178         * @throws CancellationException if cancelled by the progress observer
179         * @throws IllegalArgumentException on empty input, or when cluster counts are invalid
180         * @throws chemaxon.license.LicenseException when appropriate license is not available
181         */
182        public static IDBasedSingleLevelClustering adaptiveSPHEX(
183                int initialMinCC, int maxCC, DissimilarityInput di, ProgressObserver po) throws CancellationException {
184    
185            // Check licenses
186            LicenseHandler.getInstance().checkLicense(LicenseManager.JKLUSTOR);
187    
188            if (di.size() == 0) {
189                throw new IllegalArgumentException("Empty list as input.");
190            }
191    
192            if (di.size() == 1) {
193                if (LOG.isDebugEnabled()) {
194                    LOG.debug("Single element list as input.");
195                }
196                final IDBasedClusterBuilder b = new IDBasedClusterBuilder();
197                b.addStructureToCluster(0, 0);
198                b.updateRepresentant(0, 0);
199                return b.build();
200            }
201    
202            if (maxCC < initialMinCC) {
203                throw new IllegalArgumentException("Minimum requested cluster count " + initialMinCC
204                        + " should not be greater than requested maximum cluster count " + maxCC);
205            }
206    
207            if (maxCC < 0) {
208                throw new IllegalArgumentException("Maximum requested cluster count " + maxCC
209                        + " should be a positive number.");
210            }
211    
212            // Optimization: cluster count can not be greater than the input size
213            // minimal cluster count to consider
214            final int minCC;
215            if (initialMinCC > di.size()) {
216                //minCC = di.size() > 1 ? di.size() / 2 : 1;
217                // Be consistent when this optimization kicks in
218                minCC = di.size();
219                if (LOG.isDebugEnabled()) {
220                    LOG.debug("Too few element in the input. Minimum requested cluster count " + initialMinCC
221                            + " will be changed to " + minCC);
222                }
223            } else {
224                minCC = initialMinCC;
225            }
226    
227            // estimate an initial cluster radius
228            final InitialGuess initial = new FirstTwoElements();
229            double r = initial.guess(di);
230            if (r == 0.0) {
231                if (LOG.isDebugEnabled()) {
232                    LOG.debug("Initial guess returned zero radius, will using 1.0");
233                }
234                r = 1.0;
235            }
236    
237            // Number of valid centroid sets found before return
238            int maxValidCount = 10;
239    
240            int validCentroidCount = 0;
241            int[] bestValidCentroid = null;
242    
243    
244            // upper valid radius limit
245            // if present radius values equal or greater are known to be invalid
246            Optional<Double> upperRadiusLimit = Optional.<Double>absent();
247    
248            // lower valid radius limit
249            // if presemt radius values equal or less are known to be incalid
250            Optional<Double> lowerRadiusLimit = Optional.<Double>absent();
251    
252    
253    
254            if (LOG.isDebugEnabled()) {
255                LOG.debug("Max centroid valid count: " + maxValidCount);
256            }
257            while (validCentroidCount < maxValidCount) {
258    
259    
260                if (LOG.isTraceEnabled()) {
261                    LOG.trace("Try r: " + r + "(known lower r limit: " + lowerRadiusLimit + "; known upper limit: "
262                            + upperRadiusLimit);
263                }
264    
265                Optional<int[]> centroids;
266                SubProgressObserver spo = po.subTask("Identify sphere exclusion centroids for radius " + r, di.size());
267                try {
268                    centroids = sphexCentroids(r, maxCC, di, spo);
269                } finally {
270                    spo.done();
271                }
272    
273                if (po.isCancelled()) {
274                    throw new CancellationException("Calculation was cancelled.");
275                }
276    
277                if (centroids.isPresent()) {
278                    if (LOG.isTraceEnabled()) {
279                        LOG.trace("Found centroid count: " + centroids.get().length);
280                    }
281    
282    
283                    final int[] c = centroids.get();
284                    assert (c.length <= maxCC);
285                    if (LOG.isDebugEnabled()) {
286                        LOG.debug("Valid centroids found: " + Arrays.toString(c));
287                    }
288                    if (c.length >= minCC) {
289                        if (LOG.isDebugEnabled()) {
290                            LOG.debug("Valid centroids accepted");
291                        }
292                        spo = po.subTask("Assign structures to clusters", di.size());
293                        IDBasedSingleLevelClustering clusters;
294                        try {
295                            clusters = sphexClassify(c, di, spo);
296                        } finally {
297                            spo.done();
298                        }
299                        return clusters;
300                    }
301                    validCentroidCount++;
302                    if (bestValidCentroid == null || bestValidCentroid.length < c.length) {
303                        // the length of the currently found is closer to minCC
304                        bestValidCentroid = c.clone();
305                    }
306                    // radius is too big, too few clusters were generated
307                    // we know the current radius is an upper limit for the radiuses
308                    upperRadiusLimit = Optional.of(r);
309    
310                    // decrease radius
311                    if (lowerRadiusLimit.isPresent()) {
312                        // if we have a known lower limit, target the midpoint
313                        r = (r + lowerRadiusLimit.get()) / 2;
314                    } else {
315                        // no lower limit, decrease
316                        r /= 2.0;
317                    }
318    
319                } else {
320                    // the radius is too small, too many clusters were generated
321                    if (LOG.isTraceEnabled()) {
322                        LOG.trace("Too many clusters");
323                    }
324                    // it is known that current radius is too small
325                    lowerRadiusLimit = Optional.of(r);
326    
327                    // increase radius
328                    if (upperRadiusLimit.isPresent()) {
329                        // if we have upper limit, target the midpoint of the open interval
330                        r = (r + upperRadiusLimit.get()) / 2;
331                    } else {
332                        // no upper limit, increase
333                        r *= 3.0;
334                    }
335                }
336                po.worked(1);
337            }
338            assert (bestValidCentroid != null);
339            if (LOG.isDebugEnabled()) {
340                LOG.debug("Valid centroids count is not in the predefined limit returnig the best effort");
341            }
342            final SubProgressObserver spo = po.subTask("Assign structures to clusters", di.size());
343            try {
344                return sphexClassify(bestValidCentroid, di, spo);
345            } finally {
346                spo.done();
347            }
348    
349        }
350    
351        /**
352         * Identify SphereExclusion clustering based centroids for a given set.
353         *
354         * @param r     Sphere radius to use.
355         * @param maxCC Maximal allowed cluster count. If the given parameters results in more identified centroids the
356         *              algorithm immediately terminates and returns with an {@link Optional#absent()} value.
357         * @param di    Input dissimilarity space
358         * @param po    Associated progress observer
359         *
360         * @return      A list of found centroid indexes if maximum cluster count is not exceeded
361         *
362         * @throws CancellationException    When termination is due to cancel request from the progress observer
363         * @throws chemaxon.license.LicenseException when appropriate license is not available
364         */
365        public static Optional<int[]> sphexCentroids(
366                double r, int maxCC, DissimilarityInput di, ProgressObserver po) throws CancellationException {
367    
368            // Check licenses
369            LicenseHandler.getInstance().checkLicense(LicenseManager.JKLUSTOR);
370    
371            // check input conditions
372            if (di.size() == 0) {
373                throw new IllegalArgumentException("Empty input is invalid.");
374            }
375    
376            if (LOG.isTraceEnabled()) {
377                LOG.trace("sphexCentroids; r: " + r + " maxCC: " + maxCC + " structures: " + di.size());
378            }
379    
380            // use try-finally to ensure progressObserver closure
381            // assign a work unit per input structure
382            po.switchToDeterminate(di.size());
383    
384            // identified centroids
385            final List<Integer> centroids = new ArrayList<Integer>();
386    
387            // by definition first item will be a centroid
388            centroids.add(0);
389    
390            // classify other (than first) items as centroids or not
391            for (int i = 1; i < di.size(); i++) {
392    
393                // look for a nearby centroid
394                boolean skip = false;
395                for (int j = 0; j < centroids.size(); j++) {
396                    if (di.dissimilarity(centroids.get(j), i) <= r) {
397                        // found; current item is not a new centroid
398                        skip = true;
399                        break;
400                    }
401                }
402                if (!skip) {
403                    // new centroid found; add
404                    centroids.add(i);
405                    // check cluster count requirement
406                    if (centroids.size() > maxCC) {
407                        if (LOG.isTraceEnabled()) {
408                            LOG.trace("At structure index " + i + " found " + centroids.size() + " centroids, exiting");
409                        }
410                        return Optional.<int[]>absent();
411                    }
412                }
413                // report work
414                po.worked(1);
415                // check cancellation
416                if (po.isCancelled()) {
417                    throw new CancellationException();
418                }
419            }
420    
421            final int[] ret = new int[centroids.size()];
422            for (int i = 0; i < ret.length; i++) {
423                ret[i] = centroids.get(i);
424            }
425            if (LOG.isTraceEnabled()) {
426                LOG.trace("Found " + centroids.size() + " centroids");
427            }
428            return Optional.of(ret);
429        }
430    
431        /**
432         * Classify input items into clusters determined by nearest centroids.
433         *
434         * @param centroids     Centroid indexes
435         * @param di            Input dissimilarity space
436         * @param po            Associated progress observer
437         *
438         * @return              Input item ID based clustering; using centroids as representants
439         *
440         * @throws CancellationException    When termination is due to cancel request from the progress observer
441         * * @throws chemaxon.license.LicenseException when appropriate license is not available
442         *
443         */
444        public static IDBasedSingleLevelClustering sphexClassify(
445                int[] centroids, DissimilarityInput di, ProgressObserver po) throws CancellationException {
446    
447            // Check licenses
448            LicenseHandler.getInstance().checkLicense(LicenseManager.JKLUSTOR);
449    
450            // check input
451            if (di.size() == 0) {
452                throw new IllegalArgumentException("Empty input is invalid.");
453            }
454            if (centroids.length == 0) {
455                throw new IllegalArgumentException("No centroid specified");
456            }
457    
458            // assign a work unit for every input structure
459            po.switchToDeterminate(di.size());
460    
461            // build output
462            final IDBasedClusterBuilder b = new IDBasedClusterBuilder();
463    
464            // classify inputs
465            for (int i = 0; i < di.size(); i++) {
466                // best cluster index found
467                int bestci = 0;
468                // best dissimilarity found
469                double bestdissim = 0;
470    
471                // look for the nearest centroid
472                for (int j = 0; j < centroids.length; j++) {
473                    final double d = di.dissimilarity(centroids[j], i);
474                    if (j == 0 || bestdissim > d) {
475                        bestdissim = d;
476                        bestci = j;
477                    }
478                }
479    
480                // register determined cluster membership and centroid as representant
481                b.addStructureToCluster(i, bestci);
482                b.updateRepresentant(centroids[bestci], bestci);
483    
484                // report work unit
485                po.worked(1);
486    
487                // check cancellation
488                if (po.isCancelled()) {
489                    throw new CancellationException();
490                }
491            }
492            // return built clusters
493            return b.build();
494    
495        }
496    }